1from __future__ import absolute_import, print_function, division
2from six import integer_types
3
4import theano
5from theano import Apply, Op
6
7from theano.compile import optdb
8from theano.gof import LocalOptGroup, ParamsType
9from theano.scalar import bool as bool_t
10from theano.tensor.basic import as_tensor_variable
11from theano.tensor.opt import in2out
12
13from .basic_ops import (GpuArrayType, CGpuKernelBase,
14                        as_gpuarray_variable, gpu_contiguous, infer_context_name, gpuarray_helper_inc_dir)
15from .opt_util import inplace_allocempty
16
17try:
18    import pygpu
19    from pygpu import blas
20except ImportError as e:
21    # To make sure theano is importable
22    pass
23
24
class BlasOp(Op):
    """
    Common base class for the GPU BLAS ops in this module.

    Supplies the C headers, header search paths and C initialization
    code needed so that generated C code can call pygpu's BLAS wrappers.
    """
    def c_headers(self):
        # Headers for the pygpu BLAS C API plus numpy/gpuarray helpers.
        return ['<blas_api.h>', '<numpy_compat.h>', '<gpuarray_helper.h>']

    def c_header_dirs(self):
        # Directories where the headers above live (pygpu's include dir
        # and Theano's own gpuarray helper dir).
        return [pygpu.get_include(), gpuarray_helper_inc_dir()]

    def c_init_code(self):
        # Import the pygpu BLAS C API into the compiled module.
        return ['import_pygpu__blas();']
34
35
class GpuGemv(BlasOp):
    """
    Gemv on the GPU.

    Computes ``output = alpha * dot(A, x) + beta * y`` for a matrix ``A``
    and vectors ``x`` and ``y`` (see the ``blas.gemv`` call in
    :meth:`perform`).  When ``inplace`` is True, ``y``'s storage may be
    reused for the output.
    """
    # Expose `inplace` to the generated C code via the params object.
    params_type = ParamsType(inplace=bool_t)
    __props__ = ('inplace',)

    def __init__(self, inplace=False):
        self.inplace = inplace
        if self.inplace:
            # Output 0 destroys (reuses the storage of) input 0, i.e. y.
            self.destroy_map = {0: [0]}

    def make_node(self, y, alpha, A, x, beta):
        """
        Build the Apply node; A, x, y must share a GPU context and dtype,
        alpha and beta are 0-d host scalars cast to that dtype.
        """
        ctx_name = infer_context_name(y, A, x)
        A = as_gpuarray_variable(A, ctx_name)
        x = as_gpuarray_variable(x, ctx_name)
        y = as_gpuarray_variable(y, ctx_name)
        alpha = as_tensor_variable(alpha)
        beta = as_tensor_variable(beta)

        assert alpha.ndim == 0
        assert beta.ndim == 0
        assert A.ndim == 2
        assert x.ndim == 1
        assert y.ndim == 1
        assert A.dtype == x.dtype == y.dtype

        # float16 not supported
        # The scalar coefficients must not upcast the array dtype.
        expected = A.dtype
        assert theano.scalar.upcast(alpha.dtype,
                                    beta.dtype, expected) == expected
        alpha = alpha.astype(expected)
        beta = beta.astype(expected)
        return Apply(self, [y, alpha, A, x, beta], [y.type()])

    def perform(self, node, inputs, out_storage, params):
        y, alpha, A, x, beta = inputs
        inplace = params.inplace
        # Do not overwrite a negatively-strided y in place.
        if inplace and y.strides[0] < 0:
            inplace = False
        if A.shape[1] == 0:
            # Inner dimension is empty: the dot product contributes
            # nothing, so return zeros.
            # NOTE(review): beta * y is ignored in this case -- presumably
            # intentional, matching the GpuArray_memset branch in c_code.
            out_storage[0][0] = pygpu.zeros(y.shape, dtype=y.dtype,
                                            context=y.context)
        else:
            out_storage[0][0] = blas.gemv(alpha, A, x, beta, y,
                                          overwrite_y=inplace)

    def c_code(self, node, name, inp, out, sub):
        vars = dict(out=out[0], y=inp[0], alpha=inp[1], A=inp[2], x=inp[3],
                    beta=inp[4], fail=sub['fail'], name=name,
                    params=sub['params'])
        # First segment: copy y into the output unless we may safely work
        # in place (inplace requested and y's stride is positive).
        code = """
               if (!%(params)s->inplace || %(y)s->ga.strides[0] <= 0) {
                 %(out)s = theano_try_copy(%(out)s, %(y)s);
                 if (%(out)s == NULL) {
                   %(fail)s
                 }
               } else {
                 Py_XDECREF(%(out)s);
                 %(out)s = %(y)s;
                 Py_INCREF(%(out)s);
               }
               """ % vars
        # in case of possible speed up using blas dot,
        # temporary hack A to 1D for vector-vector dot
        # (A is viewed as a vector for the rdot call and restored after).
        code += """
        if (PyGpuArray_DIM(%(A)s, 1) == 0) {
          int code;
          code = GpuArray_memset(&%(out)s->ga, 0);
          if (code != GA_NO_ERROR) {
            PyErr_SetString(PyExc_RuntimeError, "Memset failed");
            %(fail)s
          }
        } else if ( PyGpuArray_DIM(%(A)s, 0) == 1
         &&((dtype_%(alpha)s*)PyArray_DATA(%(alpha)s))[0] == (dtype_%(alpha)s)1.
         &&((dtype_%(beta)s*)PyArray_DATA(%(beta)s))[0] == (dtype_%(beta)s)0.
        ) {
            %(out)s->ga.nd = 0;
            %(A)s->ga.nd = 1;
            %(A)s->ga.dimensions[0] = %(A)s->ga.dimensions[1];
            ssize_t a_stride0 = %(A)s->ga.strides[0];
            %(A)s->ga.strides[0] = %(A)s->ga.strides[1];
            if (pygpu_blas_rdot(%(x)s, %(A)s, %(out)s, 0) == -1) {
                %(fail)s
            }
            %(A)s->ga.strides[0] = a_stride0;
            %(out)s->ga.nd = 1;
            %(A)s->ga.nd = 2;
            %(A)s->ga.dimensions[0] = 1;
        } else if (
            pygpu_blas_rgemv(cb_no_trans,
            ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
            %(A)s, %(x)s,
            ((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0],
            %(out)s, 0) == -1) {
            %(fail)s
        }
        """ % vars

        return code

    def c_code_cache_version(self):
        # Bump whenever the generated C code above changes.
        return (10,)
140
# Module-level singleton instances of the two GpuGemv variants.
gpugemv_no_inplace = GpuGemv(inplace=False)
gpugemv_inplace = GpuGemv(inplace=True)
143
144
class GpuGemm(BlasOp):
    """
    Gemm on the GPU.

    Computes ``output = alpha * dot(A, B) + beta * C`` (see the
    ``blas.gemm`` call in :meth:`perform`).  When ``inplace`` is True,
    ``C``'s storage may be reused for the output.
    """
    # Expose `inplace` to the generated C code via the params object.
    params_type = ParamsType(inplace=bool_t)
    __props__ = ('inplace',)
    # float16 matrices are accepted; the scalar coefficients are then
    # held in float32 (see make_node).
    _f16_ok = True

    def __init__(self, inplace=False):
        self.inplace = inplace
        if self.inplace:
            # Output 0 destroys (reuses the storage of) input 0, i.e. C.
            self.destroy_map = {0: [0]}

    def make_node(self, C, alpha, A, B, beta):
        """
        Build the Apply node; A, B, C must share a GPU context and a
        float dtype, alpha and beta are 0-d host scalars.
        """
        ctx_name = infer_context_name(C, A, B)
        A = as_gpuarray_variable(A, ctx_name)
        B = as_gpuarray_variable(B, ctx_name)
        C = as_gpuarray_variable(C, ctx_name)
        alpha = as_tensor_variable(alpha)
        beta = as_tensor_variable(beta)

        if not (A.dtype == B.dtype == C.dtype):
            raise TypeError(theano.tensor.blas.Gemm.E_mixed,
                            (A.dtype, B.dtype, C.dtype,
                             alpha.dtype, beta.dtype))
        if not A.dtype.startswith('float'):
            raise TypeError(theano.tensor.blas.Gemm.E_float, (A.dtype))

        # For float16 matrices the scalars are kept in float32.
        if A.dtype == 'float16':
            expected = 'float32'
        else:
            expected = A.dtype
        # The scalar coefficients must not upcast the expected dtype.
        assert theano.scalar.upcast(alpha.dtype,
                                    beta.dtype, expected) == expected
        alpha = alpha.astype(expected)
        beta = beta.astype(expected)

        assert alpha.ndim == 0
        assert beta.ndim == 0
        assert A.ndim == 2
        assert B.ndim == 2
        assert C.ndim == 2
        return Apply(self, [C, alpha, A, B, beta], [C.type()])

    def perform(self, node, inputs, outputs, params):
        C, alpha, A, B, beta = inputs
        inplace = params.inplace
        # Only overwrite C when it is one-segment (C- or F-contiguous).
        if inplace and not C.flags.forc:
            inplace = False
        outputs[0][0] = blas.gemm(alpha, A, B, beta, C,
                                  overwrite_c=inplace)

    def c_code(self, node, name, inp, out, sub):
        vars = dict(out=out[0], C=inp[0], alpha=inp[1], A=inp[2], B=inp[3],
                    beta=inp[4], fail=sub['fail'], name=name,
                    params=sub['params'])
        # Copy C into the output unless it can be safely overwritten in
        # place, then run rgemm on the output buffer.
        code = """
               if (!%(params)s->inplace || !GpuArray_ISONESEGMENT(&%(C)s->ga)) {
                 %(out)s = theano_try_copy(%(out)s, %(C)s);
                 if (%(out)s == NULL) {
                   %(fail)s
                 }
               } else {
                 Py_XDECREF(%(out)s);
                 %(out)s = %(C)s;
                 Py_INCREF(%(out)s);
               }
               if (pygpu_blas_rgemm(cb_no_trans, cb_no_trans,
                                    ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
                                    %(A)s, %(B)s,
                                    ((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0],
                                    %(out)s, 0) == -1) {
                 %(fail)s
               }
        """ % vars

        return code

    def c_code_cache_version(self):
        # Bump whenever the generated C code above changes.
        return (7,)
226
# Module-level singleton instances of the two GpuGemm variants.
gpugemm_no_inplace = GpuGemm(inplace=False)
gpugemm_inplace = GpuGemm(inplace=True)
229
230
class GpuGer(BlasOp):
    """
    Ger on the GPU.

    Computes the rank-1 update ``output = A + alpha * outer(x, y)``
    (see the ``blas.ger`` call in :meth:`perform`).  When ``inplace``
    is True, ``A``'s storage may be reused for the output.
    """
    # Expose `inplace` to the generated C code via the params object.
    params_type = ParamsType(inplace=bool_t)
    __props__ = ('inplace',)

    def __init__(self, inplace=False):
        self.inplace = inplace
        if self.inplace:
            # Output 0 destroys (reuses the storage of) input 0, i.e. A.
            self.destroy_map = {0: [0]}

    def make_node(self, A, alpha, x, y):
        """
        Build the Apply node; A, x, y must share a GPU context and dtype,
        alpha is a 0-d host scalar cast to that dtype.
        """
        ctx_name = infer_context_name(A, x, y)
        A = as_gpuarray_variable(A, ctx_name)
        x = as_gpuarray_variable(x, ctx_name)
        y = as_gpuarray_variable(y, ctx_name)
        alpha = as_tensor_variable(alpha)
        if not(A.dtype == x.dtype == y.dtype):
            raise TypeError('ger requires matching dtypes',
                            (A.dtype, alpha.dtype, x.dtype, y.dtype))

        # alpha must not upcast the array dtype.
        assert theano.scalar.upcast(alpha.dtype, A.dtype) == A.dtype
        alpha = alpha.astype(A.dtype)
        assert alpha.ndim == 0
        assert A.ndim == 2
        assert x.ndim == 1
        assert y.ndim == 1
        return Apply(self, [A, alpha, x, y], [A.type()])

    def perform(self, node, inp, out, params):
        A, alpha, x, y = inp
        inplace = params.inplace
        # Only overwrite A when it is one-segment (C- or F-contiguous).
        if inplace and not A.flags.forc:
            inplace = False
        out[0][0] = blas.ger(alpha, x, y, A,
                             overwrite_a=inplace)

    def c_code(self, node, name, inp, out, sub):
        vars = dict(out=out[0], A=inp[0], alpha=inp[1], x=inp[2], y=inp[3],
                    fail=sub['fail'], name=name, params=sub['params'])
        # Copy A into the output unless it can be safely overwritten in
        # place, then run rger on the output buffer.
        code = """
               if (!%(params)s->inplace || !GpuArray_ISONESEGMENT(&%(A)s->ga)) {
                 %(out)s = theano_try_copy(%(out)s, %(A)s);
                 if (%(out)s == NULL) {
                   %(fail)s
                 }
               } else {
                 Py_XDECREF(%(out)s);
                 %(out)s = %(A)s;
                 Py_INCREF(%(out)s);
               }
               if (pygpu_blas_rger(((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
                                %(x)s, %(y)s, %(out)s, 0) == -1) {
                 %(fail)s
               }
               """ % vars

        return code

    def c_code_cache_version(self):
        # Bump whenever the generated C code above changes.
        return (5,)
294
295
# Module-level singleton instances of the two GpuGer variants.
gpuger_no_inplace = GpuGer(inplace=False)
gpuger_inplace = GpuGer(inplace=True)
298
299
class GpuDot22(BlasOp):
    """
    Dot22 on the GPU.

    Plain matrix-matrix product ``output = dot(x, y)`` into a freshly
    allocated output buffer (implemented as gemm with alpha=1, beta=0).
    """
    # float16 inputs are accepted.
    _f16_ok = True
    __props__ = ()

    def make_node(self, x, y):
        """
        Build the Apply node; x and y must be matrices with the same
        dtype in the same GPU context.
        """
        ctx_name = infer_context_name(x, y)
        x = as_gpuarray_variable(x, ctx_name)
        y = as_gpuarray_variable(y, ctx_name)
        assert x.ndim == 2
        assert y.ndim == 2
        assert x.dtype == y.dtype
        # Output broadcast pattern follows (rows of x, columns of y).
        otype = x.type.clone(
            broadcastable=(x.type.broadcastable[0], y.type.broadcastable[1]))
        return Apply(self, [x, y], [otype()])

    def perform(self, node, inputs, outputs):
        x, y = inputs

        # Allocate the output and let gemm overwrite it (alpha=1, beta=0).
        out = pygpu.empty((x.shape[0], y.shape[1]), dtype=x.dtype,
                          context=x.context)
        outputs[0][0] = blas.gemm(1., x, y, 0., out,
                                  overwrite_c=True)

    def c_code(self, node, name, inputs, outputs, sub):
        dtype = node.inputs[0].dtype
        typecode = pygpu.gpuarray.dtype_to_typecode(dtype)
        vars = dict(A=inputs[0], B=inputs[1], dtype=dtype, out=outputs[0],
                    typecode=typecode,
                    fail=sub['fail'], name=name)
        # Prepare a C-ordered output of shape (A rows, B cols), then
        # run rgemm with alpha=1, beta=0 into it.
        code = """
        double one = 1.;
        double zero = 0.;

        size_t dims[] = {0, 0};
        dims[0] = PyGpuArray_DIMS(%(A)s)[0];
        dims[1] = PyGpuArray_DIMS(%(B)s)[1];

        if (theano_prep_output(&%(out)s, 2, dims, %(typecode)s, GA_C_ORDER,
                               %(A)s->context)) {
            %(fail)s
        }

        if (pygpu_blas_rgemm(cb_no_trans, cb_no_trans,
                             one,
                             %(A)s, %(B)s,
                             zero,
                             %(out)s, 0) == -1) {
            %(fail)s
        }
        """ % vars

        return code

    def c_code_cache_version(self):
        # Bump whenever the generated C code above changes.
        return (5,)
359
360gpu_dot22 = GpuDot22()
361
362
class GpuGemmBatch(BlasOp):
    """
    Batched gemm on the GPU for 3-D operands.

    Computes ``output = alpha * dot(A, B) + beta * C`` via
    ``GpuArray_rgemmBatch_3d``; all three arrays are 3-D.
    NOTE(review): presumably the leading axis is the batch dimension --
    confirm against GpuArray_rgemmBatch_3d's documentation.
    """
    # Expose `inplace` to the generated C code via the params object.
    params_type = ParamsType(inplace=bool_t)
    __props__ = ('inplace',)
    # float16 arrays are accepted; alpha/beta are then held in float32.
    _f16_ok = True

    def __init__(self, inplace=False):
        self.inplace = inplace
        if self.inplace:
            # Output 0 destroys (reuses the storage of) input 0, i.e. C.
            self.destroy_map = {0: [0]}

    def make_node(self, C, alpha, A, B, beta):
        """
        Build the Apply node; A, B, C must be 3-D with the same dtype in
        the same GPU context, alpha and beta are 0-d host scalars.
        """
        ctx_name = infer_context_name(C, A, B)
        A = as_gpuarray_variable(A, ctx_name)
        B = as_gpuarray_variable(B, ctx_name)
        C = as_gpuarray_variable(C, ctx_name)
        alpha = as_tensor_variable(alpha)
        # float16 scalars are promoted to float32.
        if alpha.dtype == 'float16':
            alpha = alpha.astype('float32')
        beta = as_tensor_variable(beta)
        if beta.dtype == 'float16':
            beta = beta.astype('float32')
        assert alpha.ndim == 0
        assert beta.ndim == 0
        assert A.ndim == 3
        assert B.ndim == 3
        assert C.ndim == 3
        assert A.dtype == B.dtype == C.dtype
        if A.dtype in ('float32', 'float64'):
            # Scalars must match the array dtype exactly...
            assert A.dtype == alpha.dtype == beta.dtype
        else:
            # ...except for float16 arrays, which use float32 scalars.
            assert 'float32' == alpha.dtype == beta.dtype
        return Apply(self, [C, alpha, A, B, beta], [C.type()])

    def c_headers(self):
        # gpuarray/blas.h declares GpuArray_rgemmBatch_3d.
        return super(GpuGemmBatch, self).c_headers() + ['<gpuarray/blas.h>']

    def c_code(self, node, name, inp, out, sub):
        vars = dict(out=out[0], C=inp[0], alpha=inp[1], A=inp[2], B=inp[3],
                    beta=inp[4], params=sub['params'],
                    fail=sub['fail'], name=name)
        # Copy C into the output unless it can be safely overwritten in
        # place, then run the batched rgemm on the output buffer.
        code = """
        int err;
        if (%(params)s->inplace){
                   if (!GpuArray_ISONESEGMENT(&%(C)s->ga)) {
                     %(out)s = theano_try_copy(%(out)s, %(C)s);
                     if (%(out)s == NULL) {
                       %(fail)s
                     }
                   } else {
                     Py_XDECREF(%(out)s);
                     %(out)s = %(C)s;
                     Py_INCREF(%(out)s);
                   }
        } else {
                   %(out)s = theano_try_copy(%(out)s, %(C)s);
                   if (%(out)s == NULL) {
                       %(fail)s
                   }
        }
        err = GpuArray_rgemmBatch_3d(
            cb_no_trans, cb_no_trans,
            ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
            &%(A)s->ga, &%(B)s->ga,
            ((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0],
            &%(out)s->ga, 0);
        if (err != GA_NO_ERROR) {
            PyErr_Format(PyExc_RuntimeError,
                         "%%s", GpuArray_error(&%(A)s->ga, err));
            %(fail)s;
        }
        """ % vars

        return code

    def c_code_cache_version(self):
        # Bump whenever the generated C code above changes.
        return (4,)
439
# Module-level singleton instances of the two GpuGemmBatch variants.
gpugemmbatch_no_inplace = GpuGemmBatch(inplace=False)
gpugemmbatch_inplace = GpuGemmBatch(inplace=True)
442
443
444class BaseGpuCorrMM(CGpuKernelBase):
445    """
446    Base class for `GpuCorrMM`, `GpuCorrMM_gradWeights` and
447    `GpuCorrMM_gradInputs`. Cannot be used directly.
448
449    Parameters
450    ----------
451    border_mode : {'valid', 'full', 'half'}
452        Additionally, the padding size could be directly specified by an integer,
453        a pair of integers, or two pairs of integers.
454    subsample
455        Perform subsampling of the output (default: (1, 1)).
456    filter_dilation
457        Perform subsampling of the input, also known as dilation (default: (1, 1)).
458    num_groups :
459        Divides the image, kernel and output tensors into num_groups
460        separate groups. Each which carry out convolutions separately (default : 1).
461    unshared
462        Perform unshared correlation (default: False)
463    """
464    check_broadcast = False
465    __props__ = ('border_mode', 'subsample', 'filter_dilation', 'num_groups', 'unshared')
466    _f16_ok = True
467
468    def __init__(self, border_mode="valid", subsample=(1, 1),
469                 filter_dilation=(1, 1), num_groups=1, unshared=False):
470        if isinstance(border_mode, integer_types):
471            if border_mode < 0:
472                raise ValueError(
473                    'invalid border_mode {}, which must be a '
474                    'non-negative integer'.format(border_mode))
475            border_mode = ((border_mode, border_mode),) * 2
476        elif isinstance(border_mode, tuple):
477            if len(border_mode) != 2:
478                raise ValueError(
479                    'invalid border_mode {} which must be a '
480                    'tuple of length 2'.format(border_mode))
481            border = ()
482            for mode in border_mode:
483                if isinstance(mode, tuple) and len(mode) == 2 and \
484                        min(mode) >= 0:
485                    border += ((int(mode[0]), int(mode[1])),)
486                elif mode >= 0:
487                    border += ((int(mode), int(mode)),)
488                else:
489                    raise ValueError(
490                        'invalid border mode {}. The tuple can only contain '
491                        'integers or tuples of length 2'.format(border_mode))
492            border_mode = border
493        elif border_mode not in ('valid', 'full', 'half'):
494            raise ValueError(
495                'invalid border_mode {}, which must be either '
496                '"valid", "full", "half", an integer or a tuple '
497                'of length 2'.format(border_mode))
498        self.border_mode = border_mode
499        if len(subsample) != 2:
500            raise ValueError("subsample must have two elements")
501        if len(filter_dilation) != 2:
502            raise ValueError("filter_dilation must have two elements")
503        self.subsample = tuple(subsample)
504        self.filter_dilation = tuple(filter_dilation)
505        if num_groups < 1:
506            raise ValueError("Number of groups should be greater than 0")
507        self.num_groups = num_groups
508        CGpuKernelBase.__init__(self, ['c_code/corr_gemm.c'])
509        self.unshared = unshared
510
511    @property
512    def pad(self):
513        if self.border_mode != 'valid':
514            return self.border_mode
515        return ((0, 0),) * 2
516
517    def __str__(self):
518        return '%s{%s, %s, %s, %s, %s}' % (
519            self.__class__.__name__,
520            self.border_mode,
521            str(self.subsample),
522            str(self.filter_dilation),
523            str(self.num_groups),
524            str(self.unshared))
525
526    def __setstate__(self, d):
527        self.__dict__.update(d)
528        if not hasattr(self, 'num_groups'):
529            self.num_groups = 1
530
531    def flops(self, inp, outp):
532        """
533        Useful with the hack in profilemode to print the MFlops.
534
535        """
536        # if the output shape is correct, then this gives the correct
537        # flops for any direction, sampling, padding, and border mode
538        inputs, filters = inp
539        outputs, = outp
540        assert inputs[1] == (filters[1] * self.num_groups)
541        # nb mul and add by output pixel
542        flops = filters[2] * filters[3] * 2
543        # nb flops by output image
544        flops *= outputs[2] * outputs[3]
545        # nb patch multiplied
546        flops *= inputs[1] * filters[0] * inputs[0] / self.num_groups
547        return flops
548
    def c_headers(self):
        # gpuarray array/BLAS headers plus Theano's helper header,
        # needed by the corr_gemm.c kernel code.
        return ["<gpuarray/array.h>", "<gpuarray/blas.h>", "gpuarray_helper.h"]
551
    def c_header_dirs(self):
        # Directory containing gpuarray_helper.h.
        return [gpuarray_helper_inc_dir()]
554
    def c_code_cache_version(self):
        # Raise this whenever modifying the C code (including the file
        # c_code/corr_gemm.c referenced in __init__).
        return (12,)
558
559    def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None):
560        """
561        This generates the C code for GpuCorrMM (direction="forward"),
562        GpuCorrMM_gradWeights (direction="backprop weights"), and
563        GpuCorrMM_gradInputs (direction="backprop inputs").
564        Depending on the direction, one of bottom, weights, top will
565        receive the output, while the other two serve as inputs.
566
567        Parameters
568        ----------
569        bottom
570            Variable name of the input images in the forward pass,
571            or the gradient of the input images in backprop wrt. inputs
572        weights
573            Variable name of the filters in the forward pass,
574            or the gradient of the filters in backprop wrt. weights
575        top
576            Variable name of the output images / feature maps in the
577            forward pass, or the gradient of the outputs in the backprop passes
578        direction : {'forward', 'backprop weights', 'backprop inputs'}
579            "forward" to correlate bottom with weights and store results in top,
580            "backprop weights" to do a valid convolution of bottom with top
581            (swapping the first two dimensions) and store results in weights,
582            and "backprop inputs" to do a full convolution of top with weights
583            (swapping the first two dimensions) and store results in bottom.
584        sub
585            Dictionary of substitutions useable to help generating the C code.
586        height
587            Required if self.subsample[0] != 1, a variable giving the height of
588            the filters for direction="backprop weights" or the height of the
589            input images for direction="backprop inputs".
590            Required if self.border_mode == 'half', a variable giving the height
591            of the filters for direction="backprop weights".
592            Not required otherwise, but if a value is given this will be checked.
593        width
594            Required if self.subsample[1] != 1, a variable giving the width of
595            the filters for direction="backprop weights" or the width of the
596            input images for direction="backprop inputs".
597            Required if self.border_mode == 'half', a variable giving the width
598            of the filters for direction="backprop weights".
599            Not required otherwise, but if a value is given this will be checked.
600
601        """
602        dH, dW = self.subsample
603        dilH, dilW = self.filter_dilation
604        numgroups = self.num_groups
605        unshared = int(self.unshared)
606        if self.border_mode == "half":
607            padH_l = padH_r = padW_l = padW_r = -1
608        elif self.border_mode == "full":
609            padH_l = padH_r = padW_l = padW_r = -2
610        elif isinstance(self.border_mode, tuple):
611            (padH_l, padH_r), (padW_l, padW_r) = self.border_mode
612        else:
613            assert self.border_mode == "valid"
614            padH_l = padH_r = padW_l = padW_r = 0
615        if direction == "forward":
616            direction = 0
617            out = top
618        elif direction == "backprop weights":
619            direction = 1
620            out = weights
621        elif direction == "backprop inputs":
622            direction = 2
623            out = bottom
624        else:
625            raise ValueError("direction must be one of 'forward', "
626                             "'backprop weights', 'backprop inputs'")
627        # When subsampling, we cannot unambiguously infer the height and width
628        # of bottom and weights from top, so we require them to be given.
629        # Similarly, when pad="half", we cannot infer the weight size.
630        if height:
631            height = '(*(npy_int*)(PyArray_DATA(%s)))' % height
632        else:
633            if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH_l == -1 or padH_r == -1)):
634                raise ValueError("height must be given for backprop with vertical sampling or pad='half'")
635            height = '-1'
636        if width:
637            width = '(*(npy_int*)(PyArray_DATA(%s)))' % width
638        else:
639            if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW_l == -1 or padW_r == -1)):
640                raise ValueError("width must be given for backprop with horizontal sampling or pad='half'")
641            width = '-1'
642
643        sub = sub.copy()
644        sub.update(locals())
645
646        return """
647    // Mandatory args
648    int direction = %(direction)s;  // forward, bprop weights, bprop inputs
649
650    // Optional args
651    size_t dH = %(dH)s;
652    size_t dW = %(dW)s;
653    size_t dilH = %(dilH)s;
654    size_t dilW = %(dilW)s;
655    int padH_l = %(padH_l)s;
656    int padH_r = %(padH_r)s;
657    int padW_l = %(padW_l)s;
658    int padW_r = %(padW_r)s;
659    int numgroups = %(numgroups)s;
660    int unshared = %(unshared)s;
661
662    PyGpuArrayObject * bottom = %(bottom)s;
663    PyGpuArrayObject * weights = %(weights)s;
664    PyGpuArrayObject * top = %(top)s;
665    PyGpuArrayObject * out2 = NULL;
666
667    int wdim, odim;
668    wdim = unshared ? 6 : 4;
669    odim = 4; //Can be set to 6 later for unshared backprop wrt weights
670
671    // Obtain or infer kernel width and height
672    // (we need to know it early to be able to handle auto-padding)
673    size_t kH, kW, dil_kH, dil_kW;
674    if (direction != 1) {
675        // weight is an input variable, we can just read its shape
676        kH = PyGpuArray_DIMS(weights)[wdim-2];
677        kW = PyGpuArray_DIMS(weights)[wdim-1];
678    }
679    else {
680        if (%(height)s != -1) {
681            // kernel height is specified (perhaps vertical subsampling or half padding)
682            kH = %(height)s;
683        }
684        else if (padH_l == -2 || padH_r == -2) {
685            // vertical full padding, we can infer the kernel height
686            kH = (2 - PyGpuArray_DIMS(bottom)[2] + (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1;
687        }
688        else {
689            // explicit padding, we can infer the kernel height
690            kH = (PyGpuArray_DIMS(bottom)[2] + padH_l + padH_r - (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1 ;
691        }
692        if (%(width)s != -1) {
693            kW = %(width)s;
694        }
695        else if (padW_l == -2 || padW_r == -2) {
696            kW = (2 - PyGpuArray_DIMS(bottom)[3] + (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
697        }
698        else {
699            kW = (PyGpuArray_DIMS(bottom)[3] + padW_l + padW_r - (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
700        }
701    }
702
703    // Implicit dilated kernel size
704    dil_kH = (kH - 1) * dilH + 1;
705    dil_kW = (kW - 1) * dilW + 1;
706
707    // Auto-padding if requested
708    if (padH_l == -1 || padH_r == -1) {  // vertical half padding
709        padH_l = padH_r = dil_kH / 2;
710    }
711    else if (padH_l == -2 || padH_r == -2) {  // vertical full padding
712        padH_l = padH_r = dil_kH - 1;
713    }
714    else if (padH_l < 0 || padH_r < 0) {
715        PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: padH must be >= -2");
716        %(fail)s
717    }
718    if (padW_l == -1 || padW_r == -1) {  // horizontal half padding
719        padW_l = padW_r = dil_kW / 2;
720    }
721    else if (padW_l == -2 || padW_r == -2) {  // horizontal full padding
722        padW_l = padW_r = dil_kW - 1;
723    }
724    else if (padW_l < 0 || padW_r < 0) {
725        PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: padW must be >= -2");
726        %(fail)s
727    }
728
729    // Infer output shape and type
730    // The inferred shape can be negative.
731    long long out_dim[6];
732    size_t out_dim_size[6];
733    out_dim[4] = out_dim[5] = 0; //Only used for unshared backprop wrt weights
734    out_dim_size[4] = out_dim_size[5] = 0; //Same
735    int out_typecode;
736    PyGpuContextObject *out_context;
737    switch(direction) {
738    case 0:  // forward pass
739        // output is top: (batchsize, num_filters, height, width)
740        // height and width: top = (bottom + pad_l + pad_r - ((weight-1)*dil + 1)) / sample + 1
741        out_dim[0] = PyGpuArray_DIMS(bottom)[0];
742        out_dim[1] = PyGpuArray_DIMS(weights)[0];
743        out_dim[2] = (PyGpuArray_DIMS(bottom)[2] + padH_l + padH_r - ((PyGpuArray_DIMS(weights)[wdim-2]-1)*dilH + 1)) / dH + 1;
744        out_dim[3] = (PyGpuArray_DIMS(bottom)[3] + padW_l + padW_r - ((PyGpuArray_DIMS(weights)[wdim-1]-1)*dilW + 1)) / dW + 1;
745        out_typecode = bottom->ga.typecode;
746        out_context = bottom->context;
747        if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
748        {
749            if (unshared) {
750                PyErr_Format(PyExc_ValueError,
751                             "GpuCorrMM: impossible output shape\\n"
752                             "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
753                             "  weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
754                             "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
755                             PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
756                             PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
757                             PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
758                             PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
759                             PyGpuArray_DIMS(weights)[4], PyGpuArray_DIMS(weights)[5],
760                             out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
761                %(fail)s
762            }
763            else {
764                PyErr_Format(PyExc_ValueError,
765                             "GpuCorrMM: impossible output shape\\n"
766                             "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
767                             "  weights shape: %%ld x %%ld x %%ld x %%ld\\n"
768                             "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
769                             PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
770                             PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
771                             PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
772                             PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
773                             out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
774                %(fail)s
775            }
776        }
777        break;
778    case 1:  // backprop wrt. weights
779        // output is weights: (num_filters, num_channels, height, width) or
780        // (num_filters, top_height, top_width, num_channels, height, width) -> for unshared
781        // height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
782        out_dim[0] = PyGpuArray_DIMS(top)[1];
783        if (unshared){
784            odim = 6;
785            out_dim[1] = PyGpuArray_DIMS(top)[2];
786            out_dim[2] = PyGpuArray_DIMS(top)[3];
787        }
788        out_dim[wdim-3] = PyGpuArray_DIMS(bottom)[1] / numgroups;
789        out_dim[wdim-2] = kH;  // already inferred further above
790        out_dim[wdim-1] = kW;  // how convenient
791        out_typecode = top->ga.typecode;
792        out_context = top->context;
793        if (unshared) {
794            if (out_dim[0] < 0 || out_dim[1] <= 0 || out_dim[2] <= 0 || out_dim[3] < 0
795                    || out_dim[4] <= 0 || out_dim[5] <= 0){
796                PyErr_Format(PyExc_ValueError,
797                             "GpuCorrMM backprop wrt. weights: impossible output shape\\n"
798                             "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
799                             "  weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
800                             "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
801                             PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
802                             PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
803                             out_dim[0], out_dim[1], out_dim[2], out_dim[3],
804                             out_dim[4], out_dim[5],
805                             PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
806                             PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
807                %(fail)s
808            }
809        }
810        else {
811             if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
812            {
813                PyErr_Format(PyExc_ValueError,
814                             "GpuCorrMM backprop wrt. weights: impossible output shape\\n"
815                             "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
816                             "  weights shape: %%ld x %%ld x %%ld x %%ld\\n"
817                             "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
818                             PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
819                             PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
820                             out_dim[0], out_dim[1], out_dim[2], out_dim[3],
821                             PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
822                             PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
823                %(fail)s
824            }
825        }
826        break;
827    case 2:  // backprop wrt. inputs
828        // output is bottom: (batchsize, num_channels, height, width)
829        // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
830        out_dim[0] = PyGpuArray_DIMS(top)[0];
831        out_dim[1] = PyGpuArray_DIMS(weights)[wdim-3] * numgroups;
832        out_dim[2] = (%(height)s != -1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[wdim-2]-1)*dilH + 1 - padH_l - padH_r;
833        out_dim[3] = (%(width)s != -1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[wdim-1]-1)*dilW + 1 - padW_l - padW_r;
834        out_typecode = top->ga.typecode;
835        out_context = top->context;
836        if (unshared) {
837            if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
838            {
839                PyErr_Format(PyExc_ValueError,
840                             "GpuCorrMM backprop wrt. inputs: impossible output shape\\n"
841                             "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
842                             "  weight shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
843                             "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
844                             out_dim[0], out_dim[1], out_dim[2], out_dim[3],
845                             PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
846                             PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
847                             PyGpuArray_DIMS(weights)[4], PyGpuArray_DIMS(weights)[5],
848                             PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
849                             PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
850                %(fail)s
851            }
852        }
853        else {
854            if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
855            {
856                PyErr_Format(PyExc_ValueError,
857                             "GpuCorrMM backprop wrt. inputs: impossible output shape\\n"
858                             "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
859                             "  weight shape: %%ld x %%ld x %%ld x %%ld\\n"
860                             "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
861                             out_dim[0], out_dim[1], out_dim[2], out_dim[3],
862                             PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
863                             PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
864                             PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
865                             PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
866                %(fail)s
867            }
868        }
869        break;
870    default:
871        PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: direction must be 0, 1, or 2\\n");
872        %(fail)s
873    }
874
875    out_dim_size[0] = (size_t)out_dim[0];
876    out_dim_size[1] = (size_t)out_dim[1];
877    out_dim_size[2] = (size_t)out_dim[2];
878    out_dim_size[3] = (size_t)out_dim[3];
879
880    if (odim == 6) {
881        out_dim_size[4] = (size_t)out_dim[4];
882        out_dim_size[5] = (size_t)out_dim[5];
883    }
884
885    // Prepare output array
886    if (theano_prep_output(&%(out)s, odim, out_dim_size, out_typecode, GA_C_ORDER, out_context) != 0)
887    {
888        if (odim == 4) {
889            PyErr_Format(PyExc_RuntimeError,
890                    "BaseGpuCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld",
891                    out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
892        }
893        if (odim == 6) {
894            PyErr_Format(PyExc_RuntimeError,
895                    "BaseGpuCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld %%lld %%lld",
896                    out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4], out_dim[5]);
897        }
898        %(fail)s
899    }
900    if (!GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga)) {
901        PyErr_SetString(PyExc_ValueError, "Only contiguous outputs are supported.");
902        %(fail)s
903    }
904
905    // Call GPU code
906    out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW,
907                padH_l, padH_r, padW_l, padW_r, numgroups, unshared);
908    if (out2==NULL){
909       %(fail)s
910    }
911    assert (out2 == %(out)s);
912
913""" % sub
914
915
class GpuCorrMM(BaseGpuCorrMM):
    """
    GPU correlation implementation using Matrix Multiplication.

    Parameters
    ----------
    border_mode
        The width of a border of implicit zeros to pad the input with.
        Must be a tuple with 2 elements giving the numbers of rows and
        columns to pad on each side, or a single integer to pad the same
        on all sides, or a string shortcut setting the padding at runtime:
        ``'valid'`` for ``(0, 0)`` (valid convolution, no padding), ``'full'``
        for ``(kernel_rows - 1, kernel_columns - 1)`` (full convolution),
        ``'half'`` for ``(kernel_rows // 2, kernel_columns // 2)`` (same
        convolution for odd-sized kernels).
        If it is a tuple containing 2 pairs of integers, then these specify
        the padding to be applied on each side ((left, right), (top, bottom)).
        Otherwise, each width is applied twice, once per side (left and right,
        top and bottom).
    subsample
        The subsample operation applied to each output image.
        Should be a tuple with 2 elements.
        `(sv, sh)` is equivalent to `GpuCorrMM(...)(...)[:,:,::sv, ::sh]`,
        but faster.
        Set to `(1, 1)` to disable subsampling.
    filter_dilation
        The filter dilation operation applied to each input image.
        Should be a tuple with 2 elements.
        Set to `(1, 1)` to disable filter dilation.
    num_groups
        The number of distinct groups the image and kernel must be
        divided into. Should be an int. Set to 1 to disable grouped
        convolution.
    unshared
        Perform unshared correlation (default: False).

    Notes
    -----
    Currently, the Op requires the inputs, filters and outputs to be
    C-contiguous. Use :func:`gpu_contiguous
    <theano.gpuarray.basic_ops.gpu_contiguous>` on these arguments
    if needed.

    You can either enable the Theano flag `optimizer_including=conv_gemm`
    to automatically replace all convolution operations with `GpuCorrMM`
    or one of its gradients, or you can use it as a replacement for
    :func:`conv2d <theano.tensor.nnet.conv.conv2d>`, called as
    `GpuCorrMM(subsample=...)(image, filters)`. The latter is currently
    faster, but note that it computes a correlation -- if you need to
    compute a convolution, flip the filters as `filters[:,:,::-1,::-1]`.

    """
    def __init__(self, border_mode="valid", subsample=(1, 1),
                 filter_dilation=(1, 1), num_groups=1, unshared=False):
        super(GpuCorrMM, self).__init__(border_mode, subsample,
                                        filter_dilation, num_groups,
                                        unshared)

    def make_node(self, img, kern):
        # Move both inputs to a common GPU context before validating.
        ctx_name = infer_context_name(img, kern)
        img = as_gpuarray_variable(img, ctx_name)
        kern = as_gpuarray_variable(kern, ctx_name)
        if img.type.ndim != 4:
            raise TypeError('img must be 4D tensor')
        if self.unshared:
            # Unshared correlation carries a separate filter per output
            # position, hence the two extra kernel dimensions.
            if kern.type.ndim != 6:
                raise TypeError('kern must be 6D tensor')
        elif kern.type.ndim != 4:
            raise TypeError('kern must be 4D tensor')

        # Output shape is (batch, filters, rows, cols); only the first two
        # dimensions can inherit broadcastability from the inputs.
        out_bcast = [img.type.broadcastable[0],
                     kern.type.broadcastable[0],
                     False, False]
        out_type = GpuArrayType(dtype=img.dtype,
                                context_name=ctx_name,
                                broadcastable=out_bcast)
        return Apply(self, [img, kern], [out_type()])

    def c_code(self, node, nodename, inp, out_, sub):
        # Forward pass: correlate bottom with weights, writing into top.
        bottom, weights = inp
        top, = out_
        return super(GpuCorrMM, self).c_code_helper(
            bottom, weights, top, "forward", sub)

    def grad(self, inp, grads):
        bottom, weights = inp
        top, = grads
        top = gpu_contiguous(top)
        # Gradients are computed by the two companion backprop ops; the
        # original spatial sizes are passed so they can be reconstructed
        # unambiguously (needed when subsampling or half padding is used).
        d_bottom = GpuCorrMM_gradInputs(
            self.border_mode, self.subsample, self.filter_dilation,
            self.num_groups, self.unshared)(weights, top,
                                            bottom.shape[-2:])
        d_weights = GpuCorrMM_gradWeights(
            self.border_mode, self.subsample, self.filter_dilation,
            self.num_groups, self.unshared)(bottom, top,
                                            weights.shape[-2:])
        return d_bottom, d_weights
1017
1018
class GpuCorrMM_gradWeights(BaseGpuCorrMM):
    """
    Gradient wrt. filters for `GpuCorrMM`.

    Notes
    -----
    You will not want to use this directly, but rely on Theano's automatic
    differentiation or graph optimization to use it as needed.

    """

    def __init__(self, border_mode="valid", subsample=(1, 1),
                 filter_dilation=(1, 1), num_groups=1, unshared=False):
        super(GpuCorrMM_gradWeights, self).__init__(
            border_mode, subsample, filter_dilation, num_groups, unshared)

    def make_node(self, img, topgrad, shape=None):
        ctx_name = infer_context_name(img, topgrad)
        img = as_gpuarray_variable(img, ctx_name)
        topgrad = as_gpuarray_variable(topgrad, ctx_name)
        if img.type.ndim != 4:
            raise TypeError('img must be 4D tensor')
        if topgrad.type.ndim != 4:
            raise TypeError('topgrad must be 4D tensor')
        if shape is None:
            # The filter shape is only recoverable from img/topgrad when
            # there is no subsampling and no runtime half padding.
            if self.subsample != (1, 1) or self.border_mode == "half":
                raise ValueError('shape must be given if subsample != (1, 1)'
                                 ' or border_mode == "half"')
            height_width = []
        else:
            assert shape[0].ndim == 0
            assert shape[1].ndim == 0
            height_width = [shape[0], shape[1]]

        if self.unshared:
            # Unshared output: (filters, top_h, top_w, channels, kH, kW).
            out_bcast = [topgrad.type.broadcastable[1], False, False,
                         img.type.broadcastable[1], False, False]
        else:
            # Shared output: (filters, channels, kH, kW).
            out_bcast = [topgrad.type.broadcastable[1],
                         img.type.broadcastable[1], False, False]
        out_type = GpuArrayType(dtype=img.dtype,
                                context_name=ctx_name,
                                broadcastable=out_bcast)
        return Apply(self, [img, topgrad] + height_width, [out_type()])

    def c_code(self, node, nodename, inp, out_, sub):
        bottom, top = inp[:2]
        height, width = inp[2:] or (None, None)
        weights, = out_
        return super(GpuCorrMM_gradWeights, self).c_code_helper(
            bottom, weights, top, "backprop weights", sub, height, width)

    def grad(self, inp, grads):
        bottom, top = inp[:2]
        weights, = grads
        weights = gpu_contiguous(weights)
        d_bottom = GpuCorrMM_gradInputs(
            self.border_mode, self.subsample, self.filter_dilation,
            self.num_groups, self.unshared)(weights, top,
                                            bottom.shape[-2:])
        d_top = GpuCorrMM(
            self.border_mode, self.subsample, self.filter_dilation,
            self.num_groups, self.unshared)(bottom, weights)
        if len(inp) == 4:
            # The explicit height/width inputs carry no gradient.
            d_height_width = (theano.gradient.DisconnectedType()(),) * 2
        else:
            d_height_width = ()
        return (d_bottom, d_top) + d_height_width

    def connection_pattern(self, node):
        if node.nin == 2:
            return [[1], [1]]
        # No gradient connection to the height and width inputs.
        return [[1], [1], [0], [0]]
1098
1099
class GpuCorrMM_gradInputs(BaseGpuCorrMM):
    """
    Gradient wrt. inputs for `GpuCorrMM`.

    Notes
    -----
    You will not want to use this directly, but rely on Theano's automatic
    differentiation or graph optimization to use it as needed.

    """

    def __init__(self, border_mode="valid", subsample=(1, 1),
                 filter_dilation=(1, 1), num_groups=1, unshared=False):
        super(GpuCorrMM_gradInputs, self).__init__(
            border_mode, subsample, filter_dilation, num_groups, unshared)

    def make_node(self, kern, topgrad, shape=None):
        ctx_name = infer_context_name(kern, topgrad)
        kern = as_gpuarray_variable(kern, ctx_name)
        topgrad = as_gpuarray_variable(topgrad, ctx_name)
        if self.unshared:
            if kern.type.ndim != 6:
                raise TypeError('kern must be 6D tensor')
        elif kern.type.ndim != 4:
            raise TypeError('kern must be 4D tensor')
        if topgrad.type.ndim != 4:
            raise TypeError('topgrad must be 4D tensor')
        if shape is None:
            # Without subsampling, the input spatial size is recoverable
            # from topgrad and the kernel, so shape is optional.
            if self.subsample != (1, 1):
                raise ValueError('shape must be given if subsample != (1, 1)')
            height_width = []
        else:
            assert shape[0].ndim == 0
            assert shape[1].ndim == 0
            height_width = [shape[0], shape[1]]

        if self.num_groups > 1:
            # With grouped convolution the channel dimension gathers
            # several kernel stacks, so it cannot be broadcastable.
            out_bcast = [topgrad.type.broadcastable[0], False,
                         False, False]
        else:
            out_bcast = [topgrad.type.broadcastable[0],
                         kern.type.broadcastable[-3], False, False]
        out_type = GpuArrayType(dtype=topgrad.dtype,
                                context_name=ctx_name,
                                broadcastable=out_bcast)
        return Apply(self, [kern, topgrad] + height_width, [out_type()])

    def c_code(self, node, nodename, inp, out_, sub):
        weights, top = inp[:2]
        height, width = inp[2:] or (None, None)
        bottom, = out_
        return super(GpuCorrMM_gradInputs, self).c_code_helper(
            bottom, weights, top, "backprop inputs", sub, height, width)

    def grad(self, inp, grads):
        weights, top = inp[:2]
        bottom, = grads
        bottom = gpu_contiguous(bottom)
        d_weights = GpuCorrMM_gradWeights(
            self.border_mode, self.subsample, self.filter_dilation,
            self.num_groups, self.unshared)(bottom, top,
                                            weights.shape[-2:])
        d_top = GpuCorrMM(
            self.border_mode, self.subsample, self.filter_dilation,
            self.num_groups, self.unshared)(bottom, weights)
        if len(inp) == 4:
            # The explicit height/width inputs carry no gradient.
            d_height_width = (theano.gradient.DisconnectedType()(),) * 2
        else:
            d_height_width = ()
        return (d_weights, d_top) + d_height_width

    def connection_pattern(self, node):
        if node.nin == 2:
            return [[1], [1]]
        # No gradient connection to the height and width inputs.
        return [[1], [1], [0], [0]]
1184
1185
1186class BaseGpuCorr3dMM(CGpuKernelBase):
1187    """
1188    Base class for `GpuCorr3dMM`, `GpuCorr3dMM_gradWeights` and
1189    `GpuCorr3dMM_gradInputs`. Cannot be used directly.
1190
1191    Parameters
1192    ----------
1193    border_mode : {'valid', 'full', 'half'}
        Additionally, the padding size can be directly specified by an integer
        or a tuple of three integers
1196    subsample
1197        Perform subsampling of the output (default: (1, 1, 1)).
1198    filter_dilation
1199        Perform subsampling of the input, also known as dilation (default: (1, 1, 1)).
1200    num_groups :
        Divides the image, kernel and output tensors into num_groups
        separate groups, each of which carries out convolutions
        separately (default: 1).
1203
1204    """
1205    check_broadcast = False
1206    __props__ = ('border_mode', 'subsample', 'filter_dilation', 'num_groups')
1207    _f16_ok = True
1208
1209    def __init__(self, border_mode="valid", subsample=(1, 1, 1),
1210                 filter_dilation=(1, 1, 1), num_groups=1):
1211        if isinstance(border_mode, integer_types):
1212            border_mode = (border_mode, border_mode, border_mode)
1213        if isinstance(border_mode, tuple):
1214            pad_h, pad_w, pad_d = map(int, border_mode)
1215            border_mode = (pad_h, pad_w, pad_d)
1216        if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or
1217                border_mode in ('valid', 'full', 'half')):
1218            raise ValueError(
1219                'invalid border_mode {}, which must be either '
1220                '"valid", "full", "half", an integer or a tuple of'
1221                ' three integers'.format(border_mode))
1222        self.border_mode = border_mode
1223        if len(subsample) != 3:
1224            raise ValueError("subsample must have three elements")
1225        if len(filter_dilation) != 3:
1226            raise ValueError("filter_dilation must have three elements")
1227        self.subsample = tuple(subsample)
1228        self.filter_dilation = tuple(filter_dilation)
1229        if num_groups < 1:
1230            raise ValueError("Number of groups should be greater than 0")
1231        self.num_groups = num_groups
1232        CGpuKernelBase.__init__(self, ['c_code/corr3d_gemm.c'])
1233
1234    @property
1235    def pad(self):
1236        if self.border_mode != 'valid':
1237            return self.border_mode
1238        return (0, 0, 0)
1239
1240    def __str__(self):
1241        return '%s{%s, %s, %s, %s}' % (
1242            self.__class__.__name__,
1243            self.border_mode,
1244            str(self.subsample),
1245            str(self.filter_dilation),
1246            str(self.num_groups))
1247
    def __setstate__(self, d):
        """Restore pickled state, filling in defaults for older pickles."""
        self.__dict__.update(d)
        # Pickles created before grouped convolution was added lack
        # num_groups; default to a single group for compatibility.
        if not hasattr(self, 'num_groups'):
            self.num_groups = 1
1252
1253    def flops(self, inp, outp):
1254        """
1255        Useful with the hack in profilemode to print the MFlops.
1256
1257        """
1258        # if the output shape is correct, then this gives the correct
1259        # flops for any direction, sampling, padding, and border mode
1260        inputs, filters = inp
1261        outputs, = outp
1262        assert inputs[1] == (filters[1] * self.num_groups)
1263        # nb mul and add by output pixel
1264        flops = filters[2] * filters[3] * filters[4] * 2
1265        # nb flops by output image
1266        flops *= outputs[2] * outputs[3] * outputs[4]
1267        # nb patch multiplied
1268        flops *= inputs[1] * filters[0] * inputs[0] / self.num_groups
1269        return flops
1270
1271    def c_headers(self):
1272        return ["<gpuarray/array.h>", "<gpuarray/blas.h>", "gpuarray_helper.h"]
1273
1274    def c_header_dirs(self):
1275        return [gpuarray_helper_inc_dir()]
1276
1277    def c_code_cache_version(self):
1278        # raise this whenever modifying the code below.
1279        return (8,)
1280
1281    def c_code_helper(self, bottom, weights, top, direction, sub,
1282                      height=None, width=None, depth=None):
1283        """
1284        This generates the C code for GpuCorr3dMM (direction="forward"),
1285        GpuCorr3dMM_gradWeights (direction="backprop weights"), and
1286        GpuCorr3dMM_gradInputs (direction="backprop inputs").
1287        Depending on the direction, one of bottom, weights, top will
1288        receive the output, while the other two serve as inputs.
1289
1290        Parameters
1291        ----------
1292        bottom
1293            Variable name of the input images in the forward pass,
1294            or the gradient of the input images in backprop wrt. inputs
1295        weights
1296            Variable name of the filters in the forward pass,
1297            or the gradient of the filters in backprop wrt. weights
1298        top
1299            Variable name of the output images / feature maps in the
1300            forward pass, or the gradient of the outputs in the backprop passes
1301        direction : {'forward', 'backprop weights', 'backprop inputs'}
1302            "forward" to correlate bottom with weights and store results in top,
1303            "backprop weights" to do a valid convolution of bottom with top
1304            (swapping the first two dimensions) and store results in weights,
1305            and "backprop inputs" to do a full convolution of top with weights
1306            (swapping the first two dimensions) and store results in bottom.
1307        sub
1308            Dictionary of substitutions useable to help generating the C code.
1309        height
1310            Required if self.subsample[0] != 1, a variable giving the height of
1311            the filters for direction="backprop weights" or the height of the
1312            input images for direction="backprop inputs".
1313            Required if self.border_mode == 'half', a variable giving the height
1314            of the filters for direction="backprop weights".
1315            Not required otherwise, but if a value is given this will be checked.
1316        width
1317            Required if self.subsample[1] != 1, a variable giving the width of
1318            the filters for direction="backprop weights" or the width of the
1319            input images for direction="backprop inputs".
1320            Required if self.border_mode == 'half', a variable giving the width
1321            of the filters for direction="backprop weights".
1322            Not required otherwise, but if a value is given this will be checked.
1323        depth
1324            Required if self.subsample[2] != 1, a variable giving the depth of
1325            the filters for direction="backprop weights" or the depth of the
1326            input images for direction="backprop inputs".
1327            Required if self.border_mode == 'half', a variable giving the depth
1328            of the filters for direction="backprop weights".
1329            Not required otherwise, but if a value is given this will be checked.
1330
1331        """
1332        dH, dW, dD = self.subsample
1333        dilH, dilW, dilD = self.filter_dilation
1334        numgroups = self.num_groups
1335        if self.border_mode == "half":
1336            padH = padW = padD = -1
1337        elif self.border_mode == "full":
1338            padH = padW = padD = -2
1339        elif isinstance(self.border_mode, tuple):
1340            padH, padW, padD = self.border_mode
1341        else:
1342            assert self.border_mode == "valid"
1343            padH = padW = padD = 0
1344        if direction == "forward":
1345            direction = 0
1346            out = top
1347        elif direction == "backprop weights":
1348            direction = 1
1349            out = weights
1350        elif direction == "backprop inputs":
1351            direction = 2
1352            out = bottom
1353        else:
1354            raise ValueError("direction must be one of 'forward', "
1355                             "'backprop weights', 'backprop inputs'")
1356        # When subsampling, we cannot unambiguously infer the height and width
1357        # of bottom and weights from top, so we require them to be given.
1358        # Similarly, when pad="half", we cannot infer the weight size.
1359        if height:
1360            height = '(*(npy_int*)(PyArray_DATA(%s)))' % height
1361        else:
1362            if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH == -1)):
1363                raise ValueError("height must be given for backprop with vertical sampling or pad='half'")
1364            height = '-1'
1365        if width:
1366            width = '(*(npy_int*)(PyArray_DATA(%s)))' % width
1367        else:
1368            if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)):
1369                raise ValueError("width must be given for backprop with horizontal sampling or pad='half'")
1370            width = '-1'
1371        if depth:
1372            depth = '(*(npy_int*)(PyArray_DATA(%s)))' % depth
1373        else:
1374            if ((direction != 0) and (dD != 1)) or ((direction == 1) and (padD == -1)):
1375                raise ValueError("depth must be given for backprop with horizontal sampling or pad='half'")
1376            depth = '-1'
1377
1378        sub = sub.copy()
1379        sub.update(locals())
1380
1381        return """
1382    // Mandatory args
1383    int direction = %(direction)s;  // forward, bprop weights, bprop inputs
1384
1385    // Optional args
1386    size_t dH = %(dH)s;
1387    size_t dW = %(dW)s;
1388    size_t dD = %(dD)s;
1389    size_t dilH = %(dilH)s;
1390    size_t dilW = %(dilW)s;
1391    size_t dilD = %(dilD)s;
1392    int padH = %(padH)s;
1393    int padW = %(padW)s;
1394    int padD = %(padD)s;
1395    int numgroups = %(numgroups)s;
1396
1397    PyGpuArrayObject * bottom = %(bottom)s;
1398    PyGpuArrayObject * weights = %(weights)s;
1399    PyGpuArrayObject * top = %(top)s;
1400    PyGpuArrayObject * out2 = NULL;
1401
1402    // Obtain or infer kernel height, width and depth
1403    // (we need to know it early to be able to handle auto-padding)
1404    size_t kH, kW, kD, dil_kH, dil_kW, dil_kD;
1405    if (direction != 1) {
1406        // weight is an input variable, we can just read its shape
1407        kH = PyGpuArray_DIMS(weights)[2];
1408        kW = PyGpuArray_DIMS(weights)[3];
1409        kD = PyGpuArray_DIMS(weights)[4];
1410    }
1411    else {
1412        if (%(height)s != -1) {
1413            // kernel height is specified (perhaps vertical subsampling or half padding)
1414            kH = %(height)s;
1415        }
1416        else if (padH == -2) {
1417            // vertical full padding, we can infer the kernel height
1418            kH = (2 - PyGpuArray_DIMS(bottom)[2] + (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1;
1419        }
1420        else {
1421            // explicit padding, we can infer the kernel height
1422            kH = (PyGpuArray_DIMS(bottom)[2] + 2*padH - (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1 ;
1423        }
1424        if (%(width)s != -1) {
1425            kW = %(width)s;
1426        }
1427        else if (padW == -2) {
1428            kW = (2 - PyGpuArray_DIMS(bottom)[3] + (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
1429        }
1430        else {
1431            kW = (PyGpuArray_DIMS(bottom)[3] + 2*padW - (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
1432        }
1433        if (%(depth)s != -1) {
1434            kD = %(depth)s;
1435        }
1436        else if (padD == -2) {
1437            kD = (2 - PyGpuArray_DIMS(bottom)[4] + (PyGpuArray_DIMS(top)[4] - 1) * dD - 1) / dilD + 1;
1438        }
1439        else {
1440            kD = (PyGpuArray_DIMS(bottom)[4] + 2*padD - (PyGpuArray_DIMS(top)[4] - 1) * dD - 1) / dilD + 1;
1441        }
1442    }
1443
1444    // Implicit dilated kernel size
1445    dil_kH = (kH - 1) * dilH + 1;
1446    dil_kW = (kW - 1) * dilW + 1;
1447    dil_kD = (kD - 1) * dilD + 1;
1448
1449    // Auto-padding if requested
1450    if (padH == -1) {  // vertical half padding
1451        padH = dil_kH / 2;
1452    }
1453    else if (padH == -2) {  // vertical full padding
1454        padH = dil_kH - 1;
1455    }
1456    else if (padH < 0) {
1457        PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: padH must be >= -2");
1458        %(fail)s
1459    }
1460    if (padW == -1) {  // horizontal half padding
1461        padW = dil_kW / 2;
1462    }
1463    else if (padW == -2) {  // horizontal full padding
1464        padW = dil_kW - 1;
1465    }
1466    else if (padW < 0) {
1467        PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: padW must be >= -2");
1468        %(fail)s
1469    }
1470    if (padD == -1) {  // depth half padding
1471        padD = dil_kD / 2;
1472    }
1473    else if (padD == -2) {  // depth full padding
1474        padD = dil_kD - 1;
1475    }
1476    else if (padD < 0) {
1477        PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: padD must be >= -2");
1478        %(fail)s
1479    }
1480
1481    // Infer output shape and type
1482    // The inferred shape can be negative.
1483    long long out_dim[5];
1484    size_t out_dim_size[5];
1485    int out_typecode;
1486    PyGpuContextObject *out_context;
1487    switch(direction) {
1488    case 0:  // forward pass
1489        // output is top: (batchsize, num_filters, height, width, depth)
1490        // height, width and depth: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1
1491        out_dim[0] = PyGpuArray_DIMS(bottom)[0];
1492        out_dim[1] = PyGpuArray_DIMS(weights)[0];
1493        out_dim[2] = (PyGpuArray_DIMS(bottom)[2] + 2*padH - ((PyGpuArray_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1;
1494        out_dim[3] = (PyGpuArray_DIMS(bottom)[3] + 2*padW - ((PyGpuArray_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1;
1495        out_dim[4] = (PyGpuArray_DIMS(bottom)[4] + 2*padD - ((PyGpuArray_DIMS(weights)[4]-1)*dilD + 1)) / dD + 1;
1496        out_typecode = bottom->ga.typecode;
1497        out_context = bottom->context;
1498        if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0 || out_dim[4] <= 0)
1499        {
1500            PyErr_Format(PyExc_ValueError,
1501                         "GpuCorr3dMM: impossible output shape\\n"
1502                         "  bottom shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
1503                         "  weights shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
1504                         "  top shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n",
1505                         PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
1506                         PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
1507                         PyGpuArray_DIMS(bottom)[4],
1508                         PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
1509                         PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
1510                         PyGpuArray_DIMS(weights)[4],
1511                         out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4]);
1512            %(fail)s
1513        }
1514        break;
1515    case 1:  // backprop wrt. weights
1516        // output is weights: (num_filters, num_channels, height, width, depth)
1517        // height, width and depth: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
1518        out_dim[0] = PyGpuArray_DIMS(top)[1];
1519        out_dim[1] = PyGpuArray_DIMS(bottom)[1] / numgroups;
1520        out_dim[2] = kH;  // already inferred further above
1521        out_dim[3] = kW;  // how convenient
1522        out_dim[4] = kD;
1523        out_typecode = top->ga.typecode;
1524        out_context = top->context;
1525        if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0 || out_dim[4] <= 0)
1526        {
1527            PyErr_Format(PyExc_ValueError,
1528                         "GpuCorr3dMM backprop wrt. weights: impossible output shape\\n"
1529                         "  bottom shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
1530                         "  weights shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
1531                         "  top shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n",
1532                         PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
1533                         PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
1534                         PyGpuArray_DIMS(bottom)[4],
1535                         out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4],
1536                         PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
1537                         PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3],
1538                         PyGpuArray_DIMS(top)[4]);
1539            %(fail)s
1540        }
1541        break;
1542    case 2:  // backprop wrt. inputs
1543        // output is bottom: (batchsize, num_channels, height, width, depth)
1544        // height, width and depth: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
1545        out_dim[0] = PyGpuArray_DIMS(top)[0];
1546        out_dim[1] = PyGpuArray_DIMS(weights)[1] * numgroups;
1547        out_dim[2] = (%(height)s != -1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
1548        out_dim[3] = (%(width)s != -1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
1549        out_dim[4] = (%(depth)s != -1) ? %(depth)s : (PyGpuArray_DIMS(top)[4] - 1) * dD + (PyGpuArray_DIMS(weights)[4]-1)*dilD + 1 - 2*padD;
1550        out_typecode = top->ga.typecode;
1551        out_context = top->context;
1552        if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0 || out_dim[4] <= 0)
1553        {
1554            PyErr_Format(PyExc_ValueError,
1555                         "GpuCorr3dMM backprop wrt. inputs: impossible output shape\\n"
1556                         "  bottom shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
1557                         "  weights shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
1558                         "  top shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n",
1559                         out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4],
1560                         PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
1561                         PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
1562                         PyGpuArray_DIMS(weights)[4],
1563                         PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
1564                         PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3],
1565                         PyGpuArray_DIMS(top)[4]);
1566            %(fail)s
1567        }
1568        break;
1569    default:
1570        PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: direction must be 0, 1, or 2\\n");
1571        %(fail)s
1572    }
1573
1574    out_dim_size[0] = (size_t)out_dim[0];
1575    out_dim_size[1] = (size_t)out_dim[1];
1576    out_dim_size[2] = (size_t)out_dim[2];
1577    out_dim_size[3] = (size_t)out_dim[3];
1578    out_dim_size[4] = (size_t)out_dim[4];
1579
1580    // Prepare output array
1581    if (theano_prep_output(&%(out)s, 5, out_dim_size, out_typecode, GA_C_ORDER, out_context) != 0)
1582    {
1583        PyErr_Format(PyExc_RuntimeError,
1584                "BaseGpuCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld x %%lld",
1585                out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4]);
1586        %(fail)s
1587    }
1588    if (!GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga)) {
1589        PyErr_SetString(PyExc_ValueError, "Only contiguous outputs are supported.");
1590        %(fail)s
1591    }
1592
1593    // Call GPU code
1594    out2 = corr3dMM(%(bottom)s, %(weights)s, %(top)s, direction,
1595                    dH, dW, dD, dilH, dilW, dilD, padH, padW, padD, numgroups);
1596    if (out2==NULL){
1597       %(fail)s
1598    }
1599    assert (out2 == %(out)s);
1600
1601""" % sub
1602
1603
class GpuCorr3dMM(BaseGpuCorr3dMM):
    """
    GPU 3D correlation implemented through matrix multiplication.

    Parameters
    ----------
    border_mode
        Width of the border of implicit zeros padding the input.
        Either a 3-tuple giving the width used on each side, a single
        integer used for all sides, or one of the string shortcuts
        resolved at runtime: ``'valid'`` for ``(0, 0, 0)`` (valid
        convolution, no padding), ``'full'`` for ``(kernel_rows - 1,
        kernel_columns - 1, kernel_depth - 1)`` (full convolution),
        ``'half'`` for ``(kernel_rows // 2, kernel_columns // 2,
        kernel_depth // 2)`` (same convolution for odd-sized kernels).
        Each of the three widths is applied twice, once per side (left
        and right, top and bottom, front and back).
    subsample
        Subsampling applied to each output image, as a 3-tuple.
        `(sv, sh, sl)` is equivalent to slicing the full output with
        `[:, :, ::sv, ::sh, ::sl]`, but faster.
        Set to `(1, 1, 1)` to disable subsampling.
    filter_dilation
        Dilation applied to the filters for each input image, as a
        3-tuple. Set to `(1, 1, 1)` to disable filter dilation.
    num_groups
        Number of distinct groups (an int) the image and kernel are
        divided into for grouped convolution.
        Set to 1 to disable grouped convolution.

    Notes
    -----
    The Op currently requires the inputs, filters and outputs to be
    C-contiguous; apply :func:`gpu_contiguous
    <theano.gpuarray.basic_ops.gpu_contiguous>` to the arguments if
    needed.

    Either enable the Theano flag `optimizer_including=conv_gemm` so
    that convolution operations are automatically replaced with
    `GpuCorr3dMM` or one of its gradients, or call it directly as
    `GpuCorr3dMM(subsample=...)(image, filters)`. The latter is
    currently faster. Note that it computes a correlation -- to get a
    convolution, flip the filters first as
    `filters[:, :, ::-1, ::-1, ::-1]`.

    """
    def __init__(self, border_mode="valid", subsample=(1, 1, 1),
                 filter_dilation=(1, 1, 1), num_groups=1):
        super(GpuCorr3dMM, self).__init__(border_mode, subsample,
                                          filter_dilation, num_groups)

    def make_node(self, img, kern):
        """Build the forward correlation node for 5D `img` and `kern`."""
        ctx_name = infer_context_name(img, kern)
        img = as_gpuarray_variable(img, ctx_name)
        kern = as_gpuarray_variable(kern, ctx_name)
        if img.type.ndim != 5:
            raise TypeError('img must be 5D tensor')
        if kern.type.ndim != 5:
            raise TypeError('kern must be 5D tensor')

        # Output is (batch, filters, h, w, d); only the first two axes
        # can inherit a broadcastable flag from the inputs.
        bcast = [img.type.broadcastable[0],
                 kern.type.broadcastable[0],
                 False, False, False]
        out_type = GpuArrayType(dtype=img.dtype,
                                context_name=ctx_name,
                                broadcastable=bcast)
        return Apply(self, [img, kern], [out_type()])

    def c_code(self, node, nodename, inp, out_, sub):
        """Emit the C implementation for the forward direction."""
        bottom, weights = inp
        top, = out_
        return super(GpuCorr3dMM, self).c_code_helper(bottom, weights,
                                                      top, "forward", sub)

    def grad(self, inp, grads):
        """Return the gradients wrt. the image and the filters."""
        bottom, weights = inp
        top, = grads
        top = gpu_contiguous(top)
        conv_args = (self.border_mode, self.subsample,
                     self.filter_dilation, self.num_groups)
        d_bottom = GpuCorr3dMM_gradInputs(*conv_args)(
            weights, top, bottom.shape[-3:])
        d_weights = GpuCorr3dMM_gradWeights(*conv_args)(
            bottom, top, weights.shape[-3:])
        return d_bottom, d_weights
1696
1697
class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM):
    """
    Gradient wrt. the filters for `GpuCorr3dMM`.

    Notes
    -----
    This Op is not meant to be used directly; Theano's automatic
    differentiation and graph optimizations insert it where needed.

    """

    def __init__(self, border_mode="valid", subsample=(1, 1, 1),
                 filter_dilation=(1, 1, 1), num_groups=1):
        super(GpuCorr3dMM_gradWeights, self).__init__(border_mode,
                                                      subsample,
                                                      filter_dilation,
                                                      num_groups)

    def make_node(self, img, topgrad, shape=None):
        """Build the backprop-wrt-weights node; `shape` supplies the
        three scalar kernel dimensions when they cannot be inferred."""
        ctx_name = infer_context_name(img, topgrad)
        img = as_gpuarray_variable(img, ctx_name)
        topgrad = as_gpuarray_variable(topgrad, ctx_name)
        if img.type.ndim != 5:
            raise TypeError('img must be 5D tensor')
        if topgrad.type.ndim != 5:
            raise TypeError('topgrad must be 5D tensor')
        if shape is None:
            # With subsampling or runtime half padding the kernel shape
            # is ambiguous, so it must be given explicitly.
            if self.subsample != (1, 1, 1) or self.border_mode == "half":
                raise ValueError('shape must be given if subsample != (1, 1, 1)'
                                 ' or border_mode == "half"')
            height_width_depth = []
        else:
            assert shape[0].ndim == 0
            assert shape[1].ndim == 0
            assert shape[2].ndim == 0
            height_width_depth = [shape[0], shape[1], shape[2]]

        # Weights are (filters, channels, h, w, d); the first two axes
        # can inherit broadcastable flags.
        bcast = [topgrad.type.broadcastable[1],
                 img.type.broadcastable[1],
                 False, False, False]
        out_type = GpuArrayType(dtype=img.dtype,
                                context_name=ctx_name,
                                broadcastable=bcast)
        return Apply(self, [img, topgrad] + height_width_depth,
                     [out_type()])

    def c_code(self, node, nodename, inp, out_, sub):
        """Emit the C implementation for the backprop-wrt-weights
        direction."""
        bottom, top = inp[:2]
        height, width, depth = inp[2:] or (None, None, None)
        weights, = out_
        return super(GpuCorr3dMM_gradWeights, self).c_code_helper(
            bottom, weights, top, "backprop weights", sub,
            height, width, depth)

    def grad(self, inp, grads):
        """Return gradients wrt. `img` and `topgrad`; the optional
        shape inputs are disconnected."""
        bottom, top = inp[:2]
        weights, = grads
        weights = gpu_contiguous(weights)
        conv_args = (self.border_mode, self.subsample,
                     self.filter_dilation, self.num_groups)
        d_bottom = GpuCorr3dMM_gradInputs(*conv_args)(
            weights, top, bottom.shape[-3:])
        d_top = GpuCorr3dMM(*conv_args)(bottom, weights)
        if len(inp) == 5:
            d_height_width_depth = (theano.gradient.DisconnectedType()(),) * 3
        else:
            d_height_width_depth = ()
        return (d_bottom, d_top) + d_height_width_depth

    def connection_pattern(self, node):
        # The optional height/width/depth inputs never influence the
        # gradient computation.
        if node.nin == 2:
            return [[1], [1]]
        return [[1], [1], [0], [0], [0]]
1773
1774
class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
    """
    Gradient wrt. the inputs for `GpuCorr3dMM`.

    Notes
    -----
    This Op is not meant to be used directly; Theano's automatic
    differentiation and graph optimizations insert it where needed.

    """

    def __init__(self, border_mode="valid", subsample=(1, 1, 1),
                 filter_dilation=(1, 1, 1), num_groups=1):
        super(GpuCorr3dMM_gradInputs, self).__init__(border_mode, subsample,
                                                     filter_dilation,
                                                     num_groups)

    def make_node(self, kern, topgrad, shape=None):
        """Build the backprop-wrt-inputs node; `shape` supplies the
        three scalar image dimensions when they cannot be inferred."""
        ctx_name = infer_context_name(kern, topgrad)
        kern = as_gpuarray_variable(kern, ctx_name)
        topgrad = as_gpuarray_variable(topgrad, ctx_name)
        if kern.type.ndim != 5:
            raise TypeError('kern must be 5D tensor')
        if topgrad.type.ndim != 5:
            raise TypeError('topgrad must be 5D tensor')
        if shape is None:
            # With subsampling the input shape is ambiguous, so it
            # must be given explicitly.
            if self.subsample != (1, 1, 1):
                raise ValueError('shape must be given if subsample != (1, 1, 1)')
            height_width_depth = []
        else:
            assert shape[0].ndim == 0
            assert shape[1].ndim == 0
            assert shape[2].ndim == 0
            height_width_depth = [shape[0], shape[1], shape[2]]

        # Output is (batch, channels, h, w, d). With grouped
        # convolution the channel axis cannot keep a broadcastable
        # flag from the kernel.
        if self.num_groups > 1:
            chan_bcast = False
        else:
            chan_bcast = kern.type.broadcastable[-4]
        bcast = [topgrad.type.broadcastable[0], chan_bcast,
                 False, False, False]
        out_type = GpuArrayType(dtype=topgrad.dtype,
                                context_name=ctx_name,
                                broadcastable=bcast)
        return Apply(self, [kern, topgrad] + height_width_depth,
                     [out_type()])

    def c_code(self, node, nodename, inp, out_, sub):
        """Emit the C implementation for the backprop-wrt-inputs
        direction."""
        weights, top = inp[:2]
        height, width, depth = inp[2:] or (None, None, None)
        bottom, = out_
        return super(GpuCorr3dMM_gradInputs, self).c_code_helper(
            bottom, weights, top, "backprop inputs", sub,
            height, width, depth)

    def grad(self, inp, grads):
        """Return gradients wrt. `kern` and `topgrad`; the optional
        shape inputs are disconnected."""
        weights, top = inp[:2]
        bottom, = grads
        bottom = gpu_contiguous(bottom)
        conv_args = (self.border_mode, self.subsample,
                     self.filter_dilation, self.num_groups)
        d_weights = GpuCorr3dMM_gradWeights(*conv_args)(
            bottom, top, weights.shape[-3:])
        d_top = GpuCorr3dMM(*conv_args)(bottom, weights)
        if len(inp) == 5:
            d_height_width_depth = (theano.gradient.DisconnectedType()(),) * 3
        else:
            d_height_width_depth = ()
        return (d_weights, d_top) + d_height_width_depth

    def connection_pattern(self, node):
        # The optional height/width/depth inputs never influence the
        # gradient computation.
        if node.nin == 2:
            return [[1], [1]]
        return [[1], [1], [0], [0], [0]]
1852
1853
@inplace_allocempty(GpuGemv, 0)
def local_inplace_gpuagemv(node, inputs):
    """Replace a `GpuGemv` node by its in-place variant."""
    new_node = gpugemv_inplace(*inputs)
    return [new_node]
1857
1858
@inplace_allocempty(GpuGemm, 0)
def local_inplace_gpuagemm(node, inputs):
    """Replace a `GpuGemm` node by its in-place variant."""
    new_node = gpugemm_inplace(*inputs)
    return [new_node]
1862
1863
@inplace_allocempty(GpuGer, 0)
def local_inplace_gpuager(node, inputs):
    """Replace a `GpuGer` node by its in-place variant."""
    new_node = gpuger_inplace(*inputs)
    return [new_node]
1867
1868
@inplace_allocempty(GpuGemmBatch, 0)
def local_inplace_gpuagemmbatch(node, inputs):
    """Replace a `GpuGemmBatch` node by its in-place variant."""
    new_node = gpugemmbatch_inplace(*inputs)
    return [new_node]
1872
# Bundle the in-place BLAS rewrites into one in2out pass and register
# it with the optimization database.
_inplace_blas_group = LocalOptGroup(local_inplace_gpuagemv,
                                    local_inplace_gpuagemm,
                                    local_inplace_gpuager,
                                    local_inplace_gpuagemmbatch)
gpuablas_opt_inplace = in2out(_inplace_blas_group,
                              name='gpuablas_opt_inplace')

optdb.register('InplaceGpuaBlasOpt',
               gpuablas_opt_inplace,
               70.0, 'fast_run', 'inplace', 'gpuarray')
1882