1from __future__ import absolute_import, print_function, division
2import copy
3import numpy as np
4
5import theano
6from theano import Apply, scalar, Op
7from six.moves import StringIO, xrange
8from theano.gof.utils import MethodNotDefined
9from theano.scalar import Scalar, Composite
10from theano.tensor.elemwise import (Elemwise, DimShuffle, CAReduceDtype)
11from theano.scalar.basic_scipy import Erfinv, Erfcinv
12from theano.scalar.basic import upgrade_to_float_no_complex, complex_types
13
14try:
15    import pygpu
16    from pygpu import gpuarray
17    from pygpu.tools import ArrayArg
18    from pygpu.reduction import ReductionKernel
19    from pygpu.gpuarray import dtype_to_typecode
20except ImportError:
21    pass
22
23from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel,
24                        infer_context_name)
25from .type import GpuArrayType, gpu_context_type
26from .fp16_help import load_w, write_w
27
28
29def make_argument(v, name):
30    return ArrayArg(np.dtype(v.type.dtype), name)
31
32
33def as_C_string_const(s):
34    return '\n'.join('"%s\\n"' % (l.replace('"', '\\"'))
35                     for l in s.split('\n'))
36
37
38def get_scal(dt):
39    if dt == 'float16':
40        dt = 'float32'
41    return scalar.get_scalar_type(dt)
42
43
44def max_inputs_to_GpuElemwise(node_or_outputs):
45    """
46    Compute the maximum number of inputs that fit in a kernel call.
47    """
48    if isinstance(node_or_outputs, Apply):
49        outputs = node_or_outputs.outputs
50    else:
51        outputs = node_or_outputs
52
53    n_out = len(outputs)
54    ndim = outputs[0].type.ndim
55
56    ptr_size = 8
57    # Even with call32, the interface does not change, and shapes,
58    # strides, and offset are passed as 64-bits (8 bytes)
59    int_size = 8
60
61    # we take the limit from CUDA for now
62    nb_bytes_total = 4096
63
64    # Regardless of the number of arguments, we have:
65    # - The total number of elements (int)
66    # - The shape (int) on each dimension
67    fixed_size = int_size + int_size * ndim
68
69    # Each argument (input or output) has:
70    # - 1 pointer (ptr)
71    # - 1 offset (int)
72    # - 1 stride (int) per dimension
73    # Even if the tensor ends up being contiguous, code for the
74    # non-contiguous case still needs to be generated.
75    param_size = ptr_size + int_size + int_size * ndim
76
77    # Remaining for inputs
78    nb_bytes_for_inputs = nb_bytes_total - fixed_size - param_size * n_out
79
80    # Maximum number of inputs
81    max_nb_inputs = nb_bytes_for_inputs // param_size
82
83    return max_nb_inputs
84
85
86class GpuElemwise(HideC, Elemwise):
87    """
88    Elemwise on the GPU.
89
90    """
91    params_type = gpu_context_type
92    nin = property(lambda self: self.scalar_op.nin)
93    nout = property(lambda self: self.scalar_op.nout)
94    _f16_ok = True
95
96    def __str__(self):
97        if self.name is not None:
98            return self.name
99        items = str(sorted(self.inplace_pattern.items()))
100        return "GpuElemwise{%s}%s<gpuarray>" % (self.scalar_op, items)
101
102    def max_inputs(self, node_or_outputs):
103        return max_inputs_to_GpuElemwise(node_or_outputs)
104
105    def make_node(self, *inputs):
106        ctx_name = infer_context_name(*inputs)
107        inputs = [as_gpuarray_variable(i, ctx_name) for i in inputs]
108        out_info = Elemwise.get_output_info(self, GpuDimShuffle, *inputs)
109        inputs = out_info[2]
110        outputs = [GpuArrayType(broadcastable=br,
111                                context_name=ctx_name,
112                                dtype=dtype)() for dtype, br in
113                   zip(out_info[0], out_info[1])]
114        if len(outputs) > 1:
115            raise NotImplementedError()
116
117        if len(inputs) > max_inputs_to_GpuElemwise(outputs):
118            raise NotImplementedError(
119                "Can not make this GpuElemwise with that much inputs")
120
121        # Try to generate the kernel to catch SupportCodeErrors
122        scal_ins = [get_scal(i.dtype) for i in inputs]
123        fake_node = self.scalar_op.make_node(*[i() for i in scal_ins])
124        try:
125            code = fake_node.op.c_support_code_apply(fake_node, "test")
126            if code:
127                raise SupportCodeError(code)
128        except MethodNotDefined:
129            pass
130        try:
131            support_code = fake_node.op.c_support_code()
132            if "struct" in support_code:
133                # The macro is fine, the C++ struct is not.
134                raise SupportCodeError(
135                    "struct aren't supported in GpuElemwise support_code" +
136                    support_code)
137        except MethodNotDefined:
138            pass
139
140        node = Apply(self, inputs, outputs)
141        return node
142
143    def get_params(self, node):
144        return node.inputs[0].type.context
145
146    def _get_vnames(self, node):
147        inps = ['i%d' % (n,) for n, _ in enumerate(node.inputs)]
148        outs = ['o%d' % (n,) if n not in self.inplace_pattern else
149                inps[self.inplace_pattern[n]]
150                for n, _ in enumerate(node.outputs)]
151        return inps, outs
152
153    def _generate_op_string(self, node):
154        inps, outs = self._get_vnames(node)
155        scal_v_ins = [get_scal(i.dtype)() for i in node.inputs]
156
157        # As float16 isn't a c type and most GPU don't compute on it,
158        # We convert the computation to float32, and let libgpuarray
159        # load in float16 and cast to float32 and do the reverse for
160        # the output.
161        scalar_op = self.scalar_op
162        if isinstance(scalar_op, (scalar.Cast, Composite)):
163            scalar_op = scalar_op.clone_float32()
164        fake_node = scalar_op.make_node(*scal_v_ins)
165        scal_v_out = fake_node.outputs
166        assert len(scal_v_out) == len(node.outputs)
167
168        try:
169            kop = fake_node.op.c_code(fake_node, 'elem_scalar',
170                                      inps, outs,
171                                      dict(fail='return;'))
172        except MethodNotDefined:
173            raise AssertionError(
174                "No c code for this scalar. Can not make a GpuElemwise")
175        # If the following assert fail, then we need to update the
176        # code handler above.
177        assert 'npy_float16' not in kop
178
179        support_code = ""
180        try:
181            # We accept only some c_support_code().
182            # This filter is done in the make_node()
183            support_code += fake_node.op.c_support_code()
184        except MethodNotDefined:
185            pass
186        for npy, ga in [("npy_bool", "ga_bool"),
187                        ("npy_uint8", "ga_ubyte"),
188                        ("npy_uint16", "ga_ushort"),
189                        ("npy_uint32", "ga_uint"),
190                        ("npy_uint64", "ga_ulong"),
191                        ("npy_int8", "ga_byte"),
192                        ("npy_int16", "ga_short"),
193                        ("npy_int32", "ga_int"),
194                        ("npy_int64", "ga_long"),
195                        ("npy_float16", "ga_half"),
196                        ("npy_float32", "ga_float"),
197                        ("npy_float64", "ga_double"),
198                        ]:
199            kop = kop.replace(npy, ga)
200        return support_code, kop
201
202    def c_headers(self):
203        return ['<numpy_compat.h>', '<gpuarray/types.h>',
204                '<gpuarray/elemwise.h>']
205
206    def c_support_code_struct(self, node, name):
207        return "\nGpuElemwise *ge;\n"
208
209    def c_init_code_struct(self, node, name, sub):
210        inps, outs = self._get_vnames(node)
211        nargs = len(inps) + len(outs) - len(self.inplace_pattern)
212        support_code, kop = self._generate_op_string(node)
213        res = """
214        gpuelemwise_arg args[%(nargs)s] = {{0}};
215        """ % dict(nargs=nargs)
216
217        for n, (i, name) in enumerate(zip(node.inputs, inps)):
218            res += """
219            args[%(n)s].name = %(name)s;
220            args[%(n)s].typecode = %(typecode)s;
221            args[%(n)s].flags = GE_READ;
222            """ % dict(n=n, name='"%s"' % (name,),
223                       typecode=i.type.typecode)
224
225        p = len(inps)
226        for n, o in enumerate(node.outputs):
227            if n in self.inplace_pattern:
228                assert(len(node.outputs) == 1)
229                res += "\nargs[%(n)s].flags |= GE_WRITE;\n" % dict(n=self.inplace_pattern[n])
230            else:
231                res += """
232                args[%(n)s].name = %(name)s;
233                args[%(n)s].typecode = %(typecode)s;
234                args[%(n)s].flags = GE_WRITE;
235                """ % dict(n=p, name='"%s"' % (outs[n],),
236                           typecode=o.type.typecode)
237                p += 1
238
239        res += """
240        ge = GpuElemwise_new(%(ctx)s->ctx, %(support)s, %(kop)s, %(nargs)s, args, %(nd)s, GE_CONVERT_F16);
241        if (ge == NULL) {
242           PyErr_SetString(PyExc_RuntimeError, "Could not initialize elemwise support");
243           %(fail)s
244        }
245        """ % dict(nargs=nargs, ctx=sub['params'], fail=sub['fail'],
246                   support=as_C_string_const(support_code),
247                   kop=as_C_string_const(kop), nd=node.inputs[0].ndim)
248
249        return res
250
251    def c_cleanup_code_struct(self, node, name):
252        return """
253        GpuElemwise_free(ge);
254        """
255
256    def c_code(self, node, name, inputs, outputs, sub):
257        nd = node.outputs[0].ndim
258        fail = sub["fail"]
259        initial_dims = ','.join('1' for i in xrange(nd))
260        opname = str(self.scalar_op)
261        ctx = sub['params']
262        nargs = len(node.inputs) + len(node.outputs) - len(self.inplace_pattern)
263
264        # check that all inputs have valid dimensions
265        emitted_inames = {}
266        code = """
267        // +1 is so that MSVC is happy when nd == 0
268        size_t dims[%(nd)s+1] = {%(initial_dims)s};
269        void *rargs[%(nargs)s] = {0};
270        int err;
271        """ % locals()
272        for idx, iname in enumerate(inputs):
273            if iname in emitted_inames:
274                assert emitted_inames[iname] is node.inputs[idx]
275                continue
276
277            broadcasts = map(int, node.inputs[idx].broadcastable)
278            broadcasts = ', '.join(map(str, broadcasts))
279            nd = node.inputs[idx].ndim
280            code += """
281            int broadcasts_%(iname)s[%(nd)s+1] = {%(broadcasts)s};
282            """ % locals()
283            emitted_inames[iname] = node.inputs[idx]
284
285        # check that all inputs have valid dimensions
286        emitted_inames = {}
287        for idx, iname in enumerate(inputs):
288            code += "rargs[%(idx)s] = &%(iname)s->ga;\n" % dict(idx=idx, iname=iname)
289            if iname in emitted_inames:
290                continue
291            code += """
292        if (%(nd)s != PyGpuArray_NDIM(%(iname)s))
293        {
294            PyErr_Format(PyExc_TypeError,
295                         "need %(nd)s dims, not %%u",
296                         PyGpuArray_NDIM(%(iname)s));
297            %(fail)s;
298        }
299        for (int i = 0; i< %(nd)s; ++i)
300        {
301            dims[i] = (dims[i] == 1) ? PyGpuArray_DIMS(%(iname)s)[i] : dims[i];
302            if ((!(broadcasts_%(iname)s[i] &&
303                 PyGpuArray_DIMS(%(iname)s)[i] == 1)) &&
304                (dims[i] != PyGpuArray_DIMS(%(iname)s)[i]))
305            {
306                PyErr_Format(PyExc_ValueError,
307                             "GpuElemwise. Input dimension mis-match. Input"
308                             " %(idx)d (indices start at 0) has shape[%%d] == %%llu"
309                             ", but the output's size on that axis is %%llu.",
310                             i,
311                             (unsigned long long)PyGpuArray_DIMS(%(iname)s)[i],
312                             (unsigned long long)dims[i]
313                            );
314                %(fail)s;
315            }
316        }
317            """ % locals()
318            emitted_inames[iname] = True
319        # check that all outputs have valid dimensions
320        p = len(node.inputs)
321        for idx, oname in enumerate(outputs):
322            typecode = dtype_to_typecode(node.outputs[idx].dtype)
323            if idx not in self.inplace_pattern.keys():
324                code += """
325        for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
326            if (dims[i] != PyGpuArray_DIMS(%(oname)s)[i])
327            {
328                Py_DECREF(%(oname)s);
329                %(oname)s = NULL;
330            }
331        }
332        if (%(oname)s && !GpuArray_CHKFLAGS(&(%(oname)s->ga), GA_C_CONTIGUOUS))
333        {
334            Py_XDECREF(%(oname)s);
335            %(oname)s = NULL;
336        }
337        if (NULL == %(oname)s)
338        {
339            %(oname)s = pygpu_empty(%(nd)d, dims,
340                            %(typecode)s, GA_C_ORDER,
341                            %(ctx)s, Py_None);
342            if (!%(oname)s) {
343                %(fail)s
344            }
345        }
346        rargs[%(p)s] = &%(oname)s->ga;
347                """ % locals()
348                p += 1
349            else:
350                input_idx = self.inplace_pattern[idx]
351                iname = inputs[input_idx]
352                code += """
353        Py_XDECREF(%(oname)s);
354        %(oname)s = %(iname)s;
355        Py_INCREF(%(oname)s);
356        for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
357            if (dims[i] != PyGpuArray_DIMS(%(oname)s)[i])
358            {
359                PyErr_Format(PyExc_ValueError,
360                             "GpuElemwise. Output dimension mis-match. Output"
361                             " %(idx)d (indices start at 0), working inplace"
362                             " on input %(input_idx)s, has shape[%%i] == %%llu"
363                             ", but the output's size on that axis is %%llu.",
364                             i,
365                             (unsigned long long)PyGpuArray_DIMS(%(oname)s)[i],
366                             (unsigned long long)dims[i]
367                            );
368                Py_DECREF(%(oname)s);
369                %(oname)s = NULL;
370                %(fail)s;
371            }
372        }
373        """ % locals()
374
375        code += """
376        if (GpuElemwise_call(ge, rargs, GE_BROADCAST) != GA_NO_ERROR) {
377          PyErr_SetString(PyExc_RuntimeError, "Error in the elemwise call");
378          %(fail)s
379        }
380        """ % dict(fail=sub['fail'])
381
382        return str(code)
383
384    # To disable the superclass perform.
385    perform = Op.perform
386
387    # Since we don't have a perform ...
388    def python_constant_folding(self, node):
389        return False
390
391    def c_code_cache_version(self):
392        ver = self.scalar_op.c_code_cache_version()
393        if ver:
394            return (10, ver)
395        else:
396            return ver
397
398
399class SupportCodeError(Exception):
400    """
401    We do not support certain things (such as the C++ complex struct).
402
403    """
404
405
406class GpuDimShuffle(DimShuffle):
407    """
408    DimShuffle on the GPU.
409
410    """
411    _f16_ok = True
412    c_func_name = 'APPLY_SPECIFIC(gpu_dimshuffle)'
413
414    def make_node(self, input):
415        ctx_name = infer_context_name(input)
416        res = DimShuffle.make_node(self, input)
417        otype = GpuArrayType(dtype=res.outputs[0].type.dtype,
418                             broadcastable=res.outputs[0].type.broadcastable,
419                             context_name=ctx_name)
420        input = as_gpuarray_variable(input, ctx_name)
421        return Apply(self, [input], [otype()])
422
423    def __str__(self):
424        if self.inplace:
425            s = "InplaceGpuDimShuffle{%s}"
426        else:
427            s = "GpuDimShuffle{%s}"
428        return s % (','.join(str(x) for x in self.new_order))
429
430    def perform(self, node, inp, out, params):
431        input, = inp
432        storage, = out
433
434        res = input
435
436        res = res.transpose(self.shuffle + self.drop)
437
438        shape = list(res.shape[:len(self.shuffle)])
439        for augm in self.augment:
440            shape.insert(augm, 1)
441        res = res.reshape(shape)
442
443        if not self.inplace:
444            res = res.copy()
445
446        storage[0] = res
447
448
449class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
450    """
451    GpuCAReduceCuda is a Reduction along some dimensions by a scalar op.
452
453    Parameters
454    ----------
455    reduce_mask
456        The dimensions along which to reduce. The `reduce_mask` is a tuple of
457        booleans (actually integers 0 or 1) that specify for each input
458        dimension, whether to reduce it (1) or not (0).
459    pre_scalar_op
460        If present, must be a scalar op with only 1 input. We will execute it
461        on the input value before reduction.
462
463    Examples
464    --------
465    When scalar_op is a theano.scalar.basic.Add instance:
466
467      - reduce_mask == (1,) sums a vector to a scalar
468
469      - reduce_mask == (1,0) computes the sum of each column in a matrix
470
471      - reduce_mask == (0,1) computes the sum of each row in a matrix
472
473      - reduce_mask == (1,1,1) computes the sum of all elements in a 3-tensor.
474
475    Notes
476    -----
477    Any reduce_mask of all zeros is a sort of 'copy', and may be removed during
478    graph optimization.
479
480    This Op is a work in progress.
481
482    This op was recently upgraded from just GpuSum a general CAReduce. Not
483    many code cases are supported for scalar_op being anything other than
484    scalar.Add instances yet.
485
486    Important note: if you implement new cases for this op, be sure to
487    benchmark them and make sure that they actually result in a speedup.
488    GPUs are not especially well-suited to reduction operations so it is
489    quite possible that the GPU might be slower for some cases.
490
491    """
492    __props__ = ('axis', 'reduce_mask', 'dtype', 'acc_dtype', 'scalar_op',
493                 'pre_scalar_op')
494    _f16_ok = True
495    verbose = 0
496
497    def __init__(self, scalar_op, axis=None,
498                 reduce_mask=None, dtype=None, acc_dtype=None,
499                 pre_scalar_op=None):
500        if reduce_mask is not None:
501            reduce_mask = tuple(reduce_mask)
502        self.reduce_mask = reduce_mask
503
504        # used to make sure that calls to scalar op
505        # have unique name arguments
506        self._n_scalar_op_calls = 0
507        CAReduceDtype.__init__(self, scalar_op, axis=axis,
508                               dtype=dtype, acc_dtype=acc_dtype)
509        self.pre_scalar_op = pre_scalar_op
510        if pre_scalar_op:
511            assert pre_scalar_op.nin == 1
512
513    def __str__(self):
514        pre = ""
515        if self.pre_scalar_op:
516            pre = "pre=%s,red=" % str(self.pre_scalar_op)
517        ax = ''
518        if self.axis is not None:
519            ax = '{%s}' % (', '.join(str(x) for x in self.axis),)
520        return "GpuCAReduceCuda{%s%s}%s" % (pre, str(self.scalar_op), ax)
521
522    def __setstate__(self, d):
523        self.__dict__.update(d)
524        # For unpickling of old ops.
525        if not hasattr(self, "pre_scalar_op"):
526            self.pre_scalar_op = None
527
528    def make_node(self, x):
529        x = as_gpuarray_variable(x, infer_context_name(x))
530        if x.type.context.kind != b'cuda':
531            raise TypeError("GpuCAReduceCuda doesn't work for non-cuda devices")
532        ret = super(GpuCAReduceCuda, self).make_node(x)
533        self = copy.copy(self)
534        self.axis = ret.op.axis
535        if self.pre_scalar_op:
536            # Currently we only tested pre_scalar_op that don't cause
537            # upcast.
538            assert Elemwise(self.pre_scalar_op)(x).dtype == x.dtype
539        if self.reduce_mask is None:
540            if self.axis is None:
541                reduce_mask = [1] * x.type.ndim
542            else:
543                reduce_mask = [0] * x.type.ndim
544                for a in self.axis:
545                    assert reduce_mask[a] == 0
546                    reduce_mask[a] = 1
547            self.reduce_mask = tuple(reduce_mask)
548
549        if (x.type.ndim != len(self.reduce_mask)):
550            raise TypeError("x must have rank %i" % len(self.reduce_mask))
551        if ("complex" in x.dtype or
552                "complex" in ret.outputs[0].dtype or
553                "complex" in self._acc_dtype(x.dtype)):
554            raise NotImplementedError("We don't support complex in gpu reduction")
555        return Apply(self, [x], [GpuArrayType(ret.outputs[0].dtype,
556                                              ret.outputs[0].type.broadcastable,
557                                              context_name=x.type.context_name)()])
558
559    def perform(self, node, inp, out, ctx):
560        theano.Op.perform(self, node, inp, out, ctx)
561
562    def supports_c_code(self, inputs):
563        """
564        Returns True if the current op and reduce pattern has functioning C code.
565
566        """
567        # If we don't even have the right method, we certainly
568        # don't support the C code
569        # (This is the test that used to be implemented by
570        # local_gpu_sum)
571        pattern = (''.join(str(i) for i in self.reduce_mask))
572        if not hasattr(self, 'c_code_reduce_%s' % pattern):
573            return False
574
575        # Now that this is a general reduction op, we might
576        # have a method for a pattern, but that pattern
577        # might not be implemented for the current scalar op.
578        # To detect this more complicated situation, we
579        # make fake arguments to c_code, try to run them,
580        # and see if NotImplementedError gets raised.
581
582        node = self.make_node(*inputs)
583
584        name = 'fake_name'
585
586        inp = ['fake_input_name_%d' % i for i in xrange(len(inputs))]
587        out = ['fake_output_name_%d' % i for i in xrange(len(node.outputs))]
588
589        sub = {'fail': 'fake failure code', 'params': 'fake context'}
590
591        try:
592            self.c_code(node, name, inp, out, sub)
593            if not self.gpu_kernels(node, name):
594                return False
595        except NotImplementedError:
596            return False
597        return True
598
599    def c_headers(self):
600        return ['<numpy_compat.h>', '<gpuarray/types.h>']
601
602    def c_support_code(self):
603        return """
604        template <typename T>
605        static T ceil_intdiv(T a, T b)
606        {
607            return (a/b) + ((a % b) ? 1: 0);
608        }
609        """
610
611    def c_code(self, node, name, inp, out, sub):
612        x, = inp
613        z, = out
614
615        nd_in = node.inputs[0].type.ndim
616        nd_out = node.outputs[0].type.ndim
617        # For complex, we need to use theano_complex* in the c code to
618        # have it run. But libgpuarray don't understand it.
619        in_dtype = node.inputs[0].type.dtype_specs()[1]
620        out_dtype = node.outputs[0].type.dtype_specs()[1]
621        gin_dtype = "npy_" + node.inputs[0].dtype
622        gout_dtype = "npy_" + node.outputs[0].dtype
623        assert nd_in - nd_out == sum(self.reduce_mask)
624
625        sio = StringIO()
626        fail = sub['fail']
627        ctx = sub['params']
628
629        # check input
630        print("""
631        if (PyGpuArray_NDIM(%(x)s) != %(nd_in)s)
632        {
633            PyErr_Format(PyExc_TypeError,
634                         "required nd=%(nd_in)s, got nd=%%u", PyGpuArray_NDIM(%(x)s));
635            %(fail)s;
636        }
637        """ % locals(), file=sio)
638
639        # It might be nice to use a property of the op class to do this,
640        # but tensor.elemwise.CAReduce has this exact same check so I guess
641        # this is OK to do
642        if self.scalar_op in [scalar.minimum, scalar.maximum]:
643            conds = ["(PyGpuArray_DIMS(%s)[%d] == 0)" % (x, i)
644                     for i in xrange(nd_in)
645                     if self.reduce_mask[i]]
646            assert len(conds) > 0
647            cond = "(" + " || ".join(conds) + ")"
648            print("""
649            if %(cond)s
650            {
651                PyErr_Format(PyExc_ValueError," tried to reduce a 0-length axis.");
652                %(fail)s;
653            }
654            """ % locals(), file=sio)
655
656        #
657        # alloc an output if we need one
658        #
659
660        # check the basics of out output
661        print("""
662        if (  !%(z)s
663           || (PyGpuArray_NDIM(%(z)s) != %(nd_out)s)
664        """ % locals(), file=sio)
665
666        # ensure that the output has the right non-reduced dimensions
667        j = 0
668        for i in xrange(nd_in):
669            if not self.reduce_mask[i]:
670                print(" || (PyGpuArray_DIMS(%(z)s)[%(j)s] != PyGpuArray_DIMS(%(x)s)[%(i)d]) " % locals(), file=sio)
671                j += 1
672
673        print("""
674           )
675        {
676            """ % locals(), file=sio)
677        if nd_out > 0:
678            print("size_t new_dims[%(nd_out)s]; " % locals(), file=sio)
679        else:
680            print("size_t *new_dims=NULL; ", file=sio)
681
682        j = 0
683        for i in xrange(nd_in):
684            if not self.reduce_mask[i]:
685                print('new_dims[%(j)s] = PyGpuArray_DIMS(%(x)s)[%(i)s];' % locals(), file=sio)
686                j += 1
687        out_typecode = dtype_to_typecode(gout_dtype[4:])
688        print("""
689            Py_XDECREF(%(z)s);
690            %(z)s = pygpu_empty(%(nd_out)s, new_dims,
691                                %(out_typecode)s, GA_C_ORDER,
692                                %(ctx)s, Py_None);
693            if (NULL == %(z)s)
694            {
695                PyErr_Format(PyExc_RuntimeError, "Failed to allocate output");
696                %(fail)s;
697            }
698        }
699        """ % locals(), file=sio)
700
701        # \begin bracket the reduction in a check that there is
702        # actually work to do
703        if getattr(self.scalar_op, 'identity', None) == 0:
704            zero_shp = "GpuArray_memset(&%(z)s->ga, 0)" % locals()
705        # TODO: elif getattr(self.scalar_op, 'identity', None) == 1:
706        else:
707            scalar_op = self.scalar_op
708            zero_shp = """
709            PyErr_Format(PyExc_NotImplementedError,
710                         "GpuCAReduceCuda not implemented when input shape is 0"
711                         " for this scalar_op: %(scalar_op)s");
712            %(fail)s;
713            """ % locals()
714        print("""
715        if (PyGpuArray_SIZE(%(z)s) && ! PyGpuArray_SIZE(%(x)s)){
716            %(zero_shp)s;
717        }
718        else if (PyGpuArray_SIZE(%(z)s))
719        {
720        """ % locals(), file=sio)
721
722        #
723        # Now perform the reduction
724        #
725
726        if all(i == 1 for i in self.reduce_mask):
727            # check if the tensor is ccontiguous, if true, use the c_code_reduce_ccontig code.
728            # TODO: check if we are ccontiguous when we un-dimshuffle
729            # TODO: if only some dims are ccontiguous, call version with less dims.
730            print('if(%(x)s->ga.flags & GA_C_CONTIGUOUS){' % locals(),
731                  file=sio)
732            self.c_code_reduce_ccontig(sio, node, name, x, z, fail)
733            print("}else{", file=sio)
734            getattr(self, 'c_code_reduce_%s' %
735                    (''.join(str(i) for i in self.reduce_mask)))(
736                sio, node, name, x, z, fail)
737            print("}", file=sio)
738        else:
739            getattr(self, 'c_code_reduce_%s' % (''.join(
740                str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail)
741
742        # \end bracket the reduction ...
743        print("""
744        }
745        """ % locals(), file=sio)
746
747        return sio.getvalue()
748
749    def _makecall(self, node, name, x, z, fail, pattern=None, extra_dims=(), extra_strides=()):
750        """
751        Return a string for making a kernel call.
752
753        The return value looks something like:
754
755            .. code-block:: c
756
757                ssize_t stride_A0 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
758                ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
759                ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
760                if (verbose)
761                    printf("running kernel_reduce_10_%(name)s\\n");
762                size_t n_shared = sizeof(%(acc_dtype)s) * n_threads[0] * n_threads[1] * n_threads[2];
763                void *kernel_params[] = {
764                        (void *)&PyGpuArray_DIMS(%(x)s)[0],
765                        (void *)&PyGpuArray_DIMS(%(x)s)[1],
766                        (void *)%(x)s->ga.data,
767                        (void *)&%(x)s->ga.offset,
768                        (void *)&stride_A0,
769                        (void *)&stride_A1,
770                        (void *)%(z)s->ga.data,
771                        (void *)&%(z)s->ga.offset,
772                        (void *)&stride_Z0};
773                int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, n_threads, n_shared, kernel_params);
774                %(err_check)s
775        """
776        in_dtype = "npy_" + node.inputs[0].dtype
777        out_dtype = "npy_" + node.outputs[0].dtype
778        acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
779        sio = StringIO()
780        if pattern is None:
781            pattern = ''.join(str(c) for c in self.reduce_mask)
782        ndim = len(self.reduce_mask)
783        nd_out = ndim - sum(self.reduce_mask)
784        shapes_format = "shape=(%s)" % ",".join(["%llu"] * node.inputs[0].ndim)
785        shapes_data = ",".join(["(size_t) PyGpuArray_DIMS(%s)[%d]" % (x, i)
786                                for i in range(node.inputs[0].ndim)])
787        k_var = "kernel_reduce_%(pattern)s_%(name)s" % locals()
788        params = []
789
790        for i in xrange(ndim):
791            params.append("(void *)&PyGpuArray_DIMS(%(x)s)[%(i)s]" % locals())
792        for declaration, value in extra_dims:
793            print(declaration % locals(), file=sio)
794            params.append(value)
795        params.append("(void *)%(x)s->ga.data" % locals())
796        params.append("(void *)&%(x)s->ga.offset" % locals())
797        for i in xrange(ndim):
798            print("""
799            ssize_t stride_A%(i)d = PyGpuArray_STRIDES(%(x)s)[%(i)s]/sizeof(%(in_dtype)s);
800            """ % locals(), file=sio)
801            params.append("(void *)&stride_A%(i)d" % locals())
802        for declaration, value in extra_strides:
803            print(declaration % locals(), file=sio)
804            params.append(value)
805
806        params.append("(void *)%(z)s->ga.data" % locals())
807        params.append("(void *)&%(z)s->ga.offset" % locals())
808        for i in xrange(nd_out):
809            print("""
810            ssize_t stride_Z%(i)d = PyGpuArray_STRIDES(%(z)s)[%(i)s]/sizeof(%(out_dtype)s);
811            """ % locals(), file=sio)
812            params.append("(void *)&stride_Z%(i)d" % locals())
813        kernel_params = ', '.join(params)
814        err_check = """
815            if (err != GA_NO_ERROR) {
816                PyErr_Format(PyExc_RuntimeError,
817                             "gpuarray error: %(k_var)s: %%s.",
818                             GpuKernel_error(&%(k_var)s, err));
819                %(fail)s;
820            }
821        """ % locals()
822        print("""
823            if (verbose)
824                printf("running kernel_reduce_%(pattern)s_%(name)s\\n");
825            size_t n_shared = sizeof(%(acc_dtype)s) * n_threads[0] * n_threads[1] * n_threads[2];
826            void *kernel_params[] = { %(kernel_params)s };
827            if (verbose>1)
828                printf("n_threads[0]=%%lu, n_threads[1]=%%lu, "
829                       "n_threads[2]=%%lu, n_threads=%%lu, "
830                       "n_blocks[0]=%%lu, n_blocks[1]=%%lu, n_blocks[2]=%%lu, "
831                       "n_blocks=%%lu, n_shared=%%d, %(shapes_format)s\\n",
832                                  n_threads[0],n_threads[1],
833                                  n_threads[2],
834                                  n_threads[0]*n_threads[1]*
835                                  n_threads[2],
836                                  n_blocks[0],n_blocks[1],n_blocks[2],
837                                  n_blocks[0]*n_blocks[1]*n_blocks[2],
838                                  n_shared, %(shapes_data)s);
839            int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, n_threads, n_shared, kernel_params);
840            %(err_check)s
841            """ % locals(), file=sio)
842
843        return sio.getvalue()
844
845    def _k_decl(self, node, nodename, pattern=None,
846                ndim=None, reduce_mask=None):
847        """
848        Return a string to declare a kernel function.
849
850        The result will look something like this:
851
852        .. code-block:: c
853
854            KERNEL void kernel_reduce_110_%(nodename)s(
855                    const ga_size d0,
856                    const ga_size d1,
857                    const ga_size d2,
858                    const %(in_type)s *A,
859                    const ga_size offset_A,
860                    const ga_ssize sA0,
861                    const ga_ssize sA1,
862                    const ga_ssize sA2,
863                    %(out_type)s * Z,
864                    const ga_size offset_Z,
865                    const ga_ssize sZ0)
866
867        Since the nodename is unique, we don't need to put the name
868        of the scalar_op in here.
869
870        """
871        in_dtype = node.inputs[0].dtype
872        out_dtype = node.outputs[0].dtype
873        in_type = gpuarray.dtype_to_ctype(in_dtype)
874        out_type = gpuarray.dtype_to_ctype(out_dtype)
875        if reduce_mask is None:
876            reduce_mask = self.reduce_mask
877        if ndim is None:
878            ndim = len(reduce_mask)
879        if pattern is None:
880            pattern = ''.join(str(i) for i in reduce_mask)
881        kname = "kernel_reduce_%(pattern)s" % locals()
882        k_var = "kernel_reduce_%(pattern)s_%(nodename)s" % locals()
883        params = []
884        sio = StringIO()
885
886        print("""
887            KERNEL void %(kname)s(
888        """ % locals(), file=sio)
889        for i in xrange(ndim):
890            params.append('uintp')
891            print("""
892                    const ga_size d%(i)s,
893        """ % locals(), file=sio)
894        params.append(gpuarray.GpuArray)
895        params.append('uintp')
896        print("""
897                    const %(in_type)s *A, const ga_size offset_A,
898        """ % locals(), file=sio)
899        for i in xrange(ndim):
900            params.append('intp')
901            print("""
902                    const ga_ssize sA%(i)s,
903        """ % locals(), file=sio)
904        params.append(gpuarray.GpuArray)
905        params.append('uintp')
906        print("""
907                    %(out_type)s * Z, const ga_size offset_Z
908        """ % locals(), file=sio)
909        for i in xrange(ndim - sum(reduce_mask)):
910            params.append('intp')
911            print("""
912                    , const ga_ssize sZ%(i)s
913        """ % locals(), file=sio)
914        print(")", file=sio)
915        return sio.getvalue(), kname, params, k_var
916
917    def _k_init(self, node, nodename):
918        in_dtype = node.inputs[0].dtype
919        out_dtype = node.outputs[0].dtype
920        acc_dtype = self._acc_dtype(node.inputs[0].dtype)
921        # We need to use theano_complex* and not npy_complex*
922        in_type = gpuarray.dtype_to_ctype(in_dtype)
923        out_type = gpuarray.dtype_to_ctype(out_dtype)
924        acc_type = gpuarray.dtype_to_ctype(acc_dtype)
925
926        return """
927                const int threadCount = blockDim.x * blockDim.y * blockDim.z;
928                const int threadNum = threadIdx.z * blockDim.x * blockDim.y
929                + threadIdx.y * blockDim.x + threadIdx.x;
930                extern __shared__ %(acc_type)s buf[];
931                A = (const %(in_type)s *)(((char *)A)+offset_A);
932                Z = (%(out_type)s *)(((char *)Z)+offset_Z);
933                %(acc_type)s myresult = 0;
934        """ % locals()
935
936    def _assign_init(self, first_item, dtype):
937        """
938        This return the initial value for myresult.
939        If the scalar op have an identity value, return it.
940
941        Otherwise, check that the scalar op is maximum or minimum
942        and return first_item. It should be the first element of the reduction.
943        As the maximum and minimum of the same value don't change, this work.
944
945        """
946        if hasattr(self.scalar_op, 'identity'):
947            return str(self.scalar_op.identity)
948        else:
949            assert isinstance(self.scalar_op, (scalar.Maximum,
950                                               scalar.Minimum))
951            if self.pre_scalar_op:  # TODO: multiple dtypes
952                # dtype = node.inputs[0].dtype
953
954                dummy_var = scalar.Scalar(dtype=dtype)()
955
956                dummy_node = self.pre_scalar_op.make_node(dummy_var)
957
958                dummy_name = 'assign_init_pre_scalar_op' + str(self._n_scalar_op_calls)
959                self._n_scalar_op_calls += 1
960                t = self.pre_scalar_op.c_code(dummy_node, dummy_name,
961                                              (first_item,), ("",), {})
962                assert t.startswith(' = ')
963                first_item = t[3:]
964                if first_item[-1] == ';':
965                    first_item = first_item[:-1]
966
967            return first_item
968
969    def _assign_reduce(self, node, name, left, right, sub, pre):
970        """
971
972        Parameters
973        ----------
974        node
975            The node argument to this op's c_code.
976        name
977            The name argument to this op's c_code.
978        left
979            A C code string identifying an lvalue.
980        right
981            A C code string identifying an expression.
982        sub
983            The sub argument to this op's c_code.
984        pre
985            If True, we will add the pre_scalar_op.c_code.
986
987        Returns
988        -------
989        str
990            C code to reduce left and right, assigning the result to left.
991
992        """
993
994        x, = node.inputs
995        in_dtype = x.dtype
996        out_dtype = node.outputs[0].dtype
997
998        dummy_left = Scalar(dtype=out_dtype)()
999        dummy_right = Scalar(dtype=in_dtype)()
1000
1001        dummy_node = self.scalar_op.make_node(dummy_left, dummy_right)
1002
1003        dummy_name = name + '_scalar_op' + str(self._n_scalar_op_calls)
1004        self._n_scalar_op_calls += 1
1005
1006        if pre and self.pre_scalar_op:
1007            assert left == "myresult"
1008            dummy_node = self.pre_scalar_op.make_node(dummy_left)
1009            dummy_name = name + '_scalar_op' + str(self._n_scalar_op_calls)
1010            self._n_scalar_op_calls += 1
1011            t = self.pre_scalar_op.c_code(dummy_node, dummy_name,
1012                                          (right,), ("",), sub)
1013            assert t.startswith(' = ')
1014            right = t[3:]
1015            if right[-1] == ';':
1016                right = right[:-1]
1017
1018        return self.scalar_op.c_code(dummy_node, dummy_name, (left, right),
1019                                     (left,), sub)
1020
1021    def _k_reduce_buf(self, z_pos, node, name, sub):
1022        """
1023        WRITEME
1024
1025        Parameters
1026        ----------
1027        node, name, sub
1028            These should be passed through from the original call to c_code.
1029
1030        """
1031        in_dtype = "npy_" + node.inputs[0].dtype
1032        out_dtype = "npy_" + node.outputs[0].dtype
1033        acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
1034        write_out = write_w(node.outputs[0].dtype)
1035
1036        current_version = """
1037        __syncthreads(); // some kernel do multiple reduction.
1038        buf[threadNum] = myresult;
1039        __syncthreads();
1040
1041        // rest of function is handled by one warp
1042        if (threadNum < warpSize) {
1043            //round up all the partial sums into the first `warpSize` elements
1044            for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
1045            {
1046                """
1047        current_version += self._assign_reduce(node, name,
1048                                               'myresult', 'buf[i]',
1049                                               sub, False) + """
1050            }
1051            buf[threadNum] = myresult;
1052        }
1053        __syncthreads();
1054        for (unsigned int _n = warpSize / 2; _n > 0; _n /= 2) {
1055            if (threadNum < _n && threadNum + _n < threadCount)
1056            """
1057        current_version += self._assign_reduce(node, name, 'buf[threadNum]',
1058                                               'buf[threadNum+_n]', sub, False)
1059
1060        current_version += """
1061            __syncthreads();
1062        }
1063        if (threadNum == 0) {
1064          %(z_pos)s = %(write_out)s(buf[0]);
1065        }
1066        """
1067
1068        current_version = current_version % locals()
1069
1070        return current_version
1071
1072    # Threads must be organized as: threadNum%nb_reduce correspond to the same sum
1073    # nb_reduce<=warpSize
1074    def _k_reduce_buf_multiple(self, z_pos, node, name, nb_reduce):
1075        reduce_fct = self._assign_reduce(node, name, 'myresult', 'buf[i]', {}, False)
1076        write_out = write_w(node.outputs[0].dtype)
1077
1078        return """
1079        __syncthreads(); // some kernel do multiple reduction.
1080        buf[threadNum] = myresult;
1081        __syncthreads();
1082
1083        // rest of function is handled by one warp
1084        if (threadNum < %(nb_reduce)s)
1085        {
1086            //round up all the partial sums into the first `nb_reduce` elements
1087            for (int i = threadNum + %(nb_reduce)s; i < threadCount; i += %(nb_reduce)s)
1088            {
1089                %(reduce_fct)s;
1090            }
1091            %(z_pos)s = %(write_out)s(myresult);
1092        }
1093        """ % locals()
1094
1095    def c_code_reduce_ccontig(self, sio, node, name, x, z, fail):
1096        verbose = self.verbose
1097        in_dtype = "npy_" + node.inputs[0].dtype
1098        out_dtype = "npy_" + node.outputs[0].dtype
1099        if getattr(self.scalar_op, 'identity', None) == 0:
1100            zero_shp = "GpuArray_memset(&%(z)s->ga, 0)" % locals()
1101        # TODO: elif getattr(self.scalar_op, 'identity', None) == 1:
1102        else:
1103            zero_shp = """
1104            PyErr_Format(PyExc_NotImplementedError,
1105                         "GpuCAReduceCuda not implemented when input shape is 0 for this scalar_op");
1106            %(fail)s;
1107            """ % locals()
1108
1109        acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
1110        k_var = "kernel_reduce_ccontig_%(name)s" % locals()
1111        err_check = """
1112            if (err != GA_NO_ERROR) {
1113                PyErr_Format(PyExc_RuntimeError,
1114                             "gpuarray error: %(k_var)s: %%s.",
1115                             GpuKernel_error(&%(k_var)s, err));
1116                %(fail)s;
1117            }
1118        """ % locals()
1119
1120        print("""
1121        {
1122          if(PyGpuArray_SIZE(%(x)s)==0){
1123            %(zero_shp)s;
1124          }else{
1125            int verbose = %(verbose)s;
1126            size_t numEls = PyGpuArray_SIZE(%(x)s);
1127            size_t n_threads = std::min(numEls, (size_t) 256);
1128            size_t n_blocks = 1;
1129            void *kernel_params[] = {(void *)&numEls,
1130                                     (void *)%(x)s->ga.data,
1131                                     (void *)&%(x)s->ga.offset,
1132                                     (void *)%(z)s->ga.data,
1133                                     (void *)&%(z)s->ga.offset};
1134            if (verbose) printf("running kernel_reduce_ccontig_%(name)s"
1135                                " n_threads=%%llu, size=%%llu, ndim=%%u\\n",
1136                                n_threads, numEls,
1137                                PyGpuArray_NDIM(%(x)s));
1138            size_t n_shared = sizeof(%(acc_dtype)s) * n_threads;
1139            int err = GpuKernel_call(&%(k_var)s, 1, &n_blocks, &n_threads, n_shared, kernel_params);
1140            %(err_check)s
1141         }
1142        }
1143        """ % locals(), file=sio)
1144
1145    def c_code_reduce_1(self, sio, node, name, x, z, fail):
1146        verbose = self.verbose
1147        makecall = self._makecall(node, name, x, z, fail)
1148        print("""
1149        {
1150            int verbose = %(verbose)s;
1151            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1};
1152            size_t n_blocks[3] = {1, 1, 1};
1153            %(makecall)s
1154        }
1155        """ % locals(), file=sio)
1156
1157    def c_code_reduce_11(self, sio, node, name, x, z, fail):
1158        verbose = self.verbose
1159        makecall = self._makecall(node, name, x, z, fail)
1160        print("""
1161        {
1162            int verbose = %(verbose)s;
1163
1164            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 256), 1, 1};
1165            while (n_threads[1] * n_threads[0] <= 256) ++n_threads[1];
1166            n_threads[1] -= 1;
1167            if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[0])
1168                n_threads[1] = PyGpuArray_DIMS(%(x)s)[0];
1169
1170            size_t n_blocks[3] = {1, 1, 1};
1171            %(makecall)s
1172        }
1173        """ % locals(), file=sio)
1174
1175    def c_code_reduce_01X(self, sio, node, name, x, z, fail, N):
1176        """
1177
1178        Parameters
1179        ----------
1180        N
1181            The number of 1 in the pattern N=1 -> 01, N=2 -> 011 N=3 ->0111
1182            Work for N=1,2,3.
1183
1184        """
1185
1186        assert N in [1, 2, 3]
1187        verbose = self.verbose
1188        in_dtype = "npy_" + node.inputs[0].dtype
1189        out_dtype = "npy_" + node.outputs[0].dtype
1190        makecall = self._makecall(node, name, x, z, fail)
1191        N_pattern = ''.join(['1'] * N)
1192        param_dim = ",".join(["PyGpuArray_DIMS(%s)[%d]" % (x, i)
1193                              for i in xrange(N + 1)])
1194        strides_dim = ",".join(["PyGpuArray_STRIDES(%s)[%d]/sizeof(%s)"
1195                                % (x, i, in_dtype) for i in xrange(N + 1)])
1196
1197        threads_y = """
1198            //get as many y threads as we can fit
1199            while (n_threads[0] * (n_threads[1]+1) <= 256)
1200            {
1201                if (n_threads[1] < PyGpuArray_DIMS(%(x)s)[%(N)s-1])
1202                    n_threads[1] += 1;
1203                else
1204                    break;
1205            }""" % locals()
1206
1207        threads_z = """
1208            //get as many z threads as we can fit
1209            while (n_threads[0] * n_threads[1] * (n_threads[2]+1) <= 256)
1210            {
1211                if (n_threads[2] < PyGpuArray_DIMS(%(x)s)[%(N)s-2])
1212                    n_threads[2] += 1;
1213                else
1214                    break;
1215            }
1216            //Maximum for Fermi GPU on that dimensions.
1217            n_threads[2] = std::min(n_threads[2], (size_t)64);
1218        """ % locals()
1219
1220        if len(self.reduce_mask) == 2:
1221            threads_y = ''
1222            threads_z = ''
1223
1224        if len(self.reduce_mask) == 3:
1225            threads_z = ''
1226
1227        print("""
1228        {
1229            int verbose = %(verbose)s;
1230            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[%(N)s], (size_t) 256), 1, 1};
1231            %(threads_y)s
1232            %(threads_z)s
1233            size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 4096), 1, 1};
1234            %(makecall)s
1235        }
1236        """ % locals(), file=sio)
1237
1238    def c_code_reduce_01(self, sio, node, name, x, z, fail):
1239        self.c_code_reduce_01X(sio, node, name, x, z, fail, 1)
1240
1241    def c_code_reduce_011(self, sio, node, name, x, z, fail):
1242        self.c_code_reduce_01X(sio, node, name, x, z, fail, 2)
1243
1244    def c_code_reduce_0111(self, sio, node, name, x, z, fail):
1245        self.c_code_reduce_01X(sio, node, name, x, z, fail, 3)
1246
1247    def c_code_reduce_10(self, sio, node, name, x, z, fail):
1248        verbose = self.verbose
1249        in_dtype = "npy_" + node.inputs[0].dtype
1250        out_dtype = "npy_" + node.outputs[0].dtype
1251        acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
1252        k_var = "kernel_reduce_10_%(name)s" % locals()
1253        err_check = """
1254            if (err != GA_NO_ERROR) {
1255                PyErr_Format(PyExc_RuntimeError,
1256                             "gpuarray error: %(k_var)s: %%s.",
1257                             GpuKernel_error(%(k_var)s, err));
1258                %(fail)s;
1259            }
1260        """ % locals()
1261
1262        print("""
1263    {
1264        int verbose = %(verbose)s;
1265        if(PyGpuArray_STRIDES(%(x)s)[0]>
1266           PyGpuArray_STRIDES(%(x)s)[1]){
1267                // If there are a lot of summations to do, then we can use simple parallelization -
1268                // use each thread to do one sum.
1269
1270                // we might as well launch blocks of 32 threads because that's the warp size.
1271                // we could schedule more threads if we were maxing out the gridsize below, but
1272                // the gridsize is way more than the physical hardware and I think 32 threads
1273                // on a huge grid is enough to fully use the hardware.
1274                size_t n_threads[3] = {32, 1, 1};
1275
1276                // We kindof reshape the input implicitly to something 4D:
1277                //  the shape A,B,C    ->   A, B, D, E
1278                //  where C <= D*E < C+32
1279                //  where E==32
1280
1281                GpuKernel *%(k_var)s = &kernel_reduce_010_AD_%(name)s;
1282                size_t A = 1;
1283                size_t B = PyGpuArray_DIMS(%(x)s)[0];
1284                size_t C = PyGpuArray_DIMS(%(x)s)[1];
1285                size_t D = C/32;
1286                if (32*D < C) D+= 1;
1287                assert ((C <= 32*D) && (32*D < C+32));
1288
1289                // The gridsize would ideally be (A, D).  But we do the following logic to make
1290                // sure we don't ask for a grid that is too big.
1291                size_t n_blocks[3] = {A, D, 1};
1292                if (n_blocks[0] > 4096) n_blocks[0] = 4096;
1293                if (n_blocks[0]*n_blocks[1] > 4096) n_blocks[1] = 4096/n_blocks[0];
1294                ssize_t stride_A0 = 1;
1295                ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
1296                ssize_t stride_A2 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
1297                ssize_t stride_Z0 = 1;
1298                ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
1299                void *kernel_params[] = {
1300                        (void *)&A, (void *)&B, (void *)&C, (void *)&D,
1301                        (void *)%(x)s->ga.data,
1302                        (void *)&%(x)s->ga.offset,
1303                        (void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2,
1304                        (void *)%(z)s->ga.data,
1305                        (void *)&%(z)s->ga.offset,
1306                        (void *)&stride_Z0, (void *)&stride_Z1};
1307                int err = GpuKernel_call(%(k_var)s, 3, n_blocks, n_threads, 0, kernel_params);
1308                %(err_check)s
1309        }else{
1310            GpuKernel *%(k_var)s = &kernel_reduce_010_%(name)s;
1311            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1};
1312            size_t n_blocks[3] = {1, std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 4096), 1};
1313            if (verbose) {
1314              fprintf(stderr,
1315                "running kernel_reduce_10_%(name)s n_blocks=(%%llu,%%llu)\\n",
1316                (unsigned long long)n_blocks[0],
1317                (unsigned long long)n_blocks[1]);
1318            }
1319            assert(PyGpuArray_DIMS(%(x)s)[1] == PyGpuArray_DIMS(%(z)s)[0]);
1320            size_t n_shared = sizeof(%(acc_dtype)s) * n_threads[0];
1321            size_t dim_0 = 1;
1322            ssize_t stride_A0 = 1;
1323            ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
1324            ssize_t stride_A2 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
1325            ssize_t stride_Z0 = 1;
1326            ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
1327            void *kernel_params[] = {
1328                    (void *)&dim_0,
1329                    (void *)&PyGpuArray_DIMS(%(x)s)[0],
1330                    (void *)&PyGpuArray_DIMS(%(x)s)[1],
1331                    (void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset,
1332                    (void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2,
1333                    (void *)%(z)s->ga.data, (void *)&%(z)s->ga.offset,
1334                    (void *)&stride_Z0, (void *)&stride_Z1};
1335            int err = GpuKernel_call(%(k_var)s, 3, n_blocks, n_threads, n_shared, kernel_params);
1336            %(err_check)s
1337        }
1338    }
1339        """ % locals(), file=sio)
1340
1341    def c_code_reduce_010(self, sio, node, name, x, z, fail):
1342        verbose = self.verbose
1343        makecall = self._makecall(node, name, x, z, fail)
1344        makecall_inner = self._makecall(node, name, x, z, fail,
1345                                        pattern="010_inner")
1346        pattern = ''.join(str(i) for i in self.reduce_mask)
1347        in_dtype = "npy_" + node.inputs[0].dtype
1348        out_dtype = "npy_" + node.outputs[0].dtype
1349        k_var = "kernel_reduce_010_AD_%(name)s" % locals()
1350        err_check = """
1351            if (err != GA_NO_ERROR) {
1352                PyErr_Format(PyExc_RuntimeError,
1353                             "gpuarray error: %(k_var)s: %%s.",
1354                             GpuKernel_error(&%(k_var)s, err));
1355                %(fail)s;
1356            }
1357        """ % locals()
1358        print("""
1359        {
1360            //int n_summations = PyGpuArray_DIMS(%(x)s)[0] * PyGpuArray_DIMS(%(x)s)[2];
1361
1362            //if ((n_summations >= 15 * 32) && (PyGpuArray_DIMS(%(x)s)[2]>=16))
1363            if (1) // if the alternative is less buggy, consider not using this branch
1364            {
1365                // If there are a lot of summations to do, then we can use simple parallelization -
1366                // use each thread to do one sum.
1367
1368                // we might as well launch blocks of 32 threads because that's the warp size.
1369                // we could schedule more threads if we were maxing out the gridsize below, but
1370                // the gridsize is way more than the physical hardware and I think 32 threads
1371                // on a huge grid is enough to fully use the hardware.
1372                size_t n_threads[3] = {32, 1, 1};
1373
1374                // We kindof reshape the input implicitly to something 4D:
1375                //  the shape A,B,C    ->   A, B, D, E
1376                //  where C <= D*E < C+32
1377                //  where E==32
1378
1379                size_t A = PyGpuArray_DIMS(%(x)s)[0];
1380                size_t B = PyGpuArray_DIMS(%(x)s)[1];
1381                size_t C = PyGpuArray_DIMS(%(x)s)[2];
1382                size_t D = C/32;
1383                if (32*D < C) D+= 1;
1384                assert ((C <= 32*D) && (32*D < C+32));
1385
1386                // The gridsize would ideally be (A, D).  But we do the following logic to make
1387                // sure we don't ask for a grid that is too big.
1388                size_t n_blocks[3] = {A, D, 1};
1389                if (n_blocks[0] > 4096) n_blocks[0] = 4096;
1390                if (n_blocks[0]*n_blocks[1] > 4096) n_blocks[1] = 4096/n_blocks[0];
1391                ssize_t stride_A0 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
1392                ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
1393                ssize_t stride_A2 = PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s);
1394                ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
1395                ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1]/sizeof(%(out_dtype)s);
1396                void *kernel_params[] = {
1397                        (void *)&A, (void *)&B, (void *)&C, (void *)&D,
1398                        (void *)%(x)s->ga.data,
1399                        (void *)&%(x)s->ga.offset,
1400                        (void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2,
1401                        (void *)%(z)s->ga.data,
1402                        (void *)&%(z)s->ga.offset,
1403                        (void *)&stride_Z0, (void *)&stride_Z1};
1404                int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, n_threads, 0, kernel_params);
1405                %(err_check)s
1406            }
1407            else
1408            {
1409                int verbose = %(verbose)s;
1410
1411                  size_t n_threads[3] = {std::min((size_t) 32, PyGpuArray_DIMS(%(x)s)[2]), 1, 1};
1412                  while(    (n_threads[0]*(n_threads[1]+1)<=256)
1413                         && (n_threads[1]<PyGpuArray_DIMS(%(x)s)[1])){
1414                      n_threads[1]++;
1415                  }
1416
1417                  size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)4096), 1, 1};
1418                  n_blocks[1] = std::min(
1419                      ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],
1420                                  (size_t)n_threads[0]),
1421                      (size_t)(4096 / n_blocks[0])
1422                      );
1423                if(std::min(std::min(PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s),
1424                                     PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s)),
1425                            PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s))
1426                   ==PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s)
1427                  && n_blocks[1]==ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],
1428                                             (size_t)n_threads[0])){
1429                  if(verbose>1)
1430                    printf("n_block.x.1=%%d, n_block.x.2=%%d, n_block.y.1=%%d, n_block.y.2=%%d,\\n",
1431                           PyGpuArray_DIMS(%(x)s)[0],4096,
1432                           ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],(size_t)n_threads[0]),
1433                                       (size_t)(4096 / n_blocks[0]));
1434                  assert(n_threads[0]<=32);
1435                  %(makecall_inner)s
1436                }else{
1437                  n_threads[0] = std::min(PyGpuArray_DIMS(%(x)s)[1],
1438                                         (size_t) 256);
1439                  n_blocks[0] = std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)4096);
1440                  n_blocks[1] = std::min(
1441                      PyGpuArray_DIMS(%(x)s)[2],
1442                      (size_t)(4096 / n_blocks[0])
1443                      );
1444                  %(makecall)s
1445                }
1446            }
1447        }
1448        """ % locals(), file=sio)
1449
1450    def c_code_reduce_0101(self, sio, node, name, x, z, fail):
1451        verbose = self.verbose
1452        makecall = self._makecall(node, name, x, z, fail)
1453        print("""
1454        {
1455            int verbose = %(verbose)s;
1456            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3], (size_t) 256), 1, 1};
1457            while (n_threads[0] * n_threads[1] <= 256)
1458            {
1459                if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[1]) break;
1460                n_threads[1] += 1;
1461            }
1462            n_threads[1] -= 1;
1463            size_t n_blocks[3] = {PyGpuArray_DIMS(%(x)s)[0], PyGpuArray_DIMS(%(x)s)[2], 1};
1464            %(makecall)s
1465        }
1466        """ % locals(), file=sio)
1467
1468    def c_code_reduce_100(self, sio, node, name, x, z, fail):
1469        verbose = self.verbose
1470        makecall = self._makecall(node, name, x, z, fail)
1471        in_dtype = "npy_" + node.inputs[0].dtype
1472        out_dtype = "npy_" + node.outputs[0].dtype
1473        acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
1474        k_var = "kernel_reduce_010_AD_%(name)s" % locals()
1475        err_check = """
1476            if (err != GA_NO_ERROR) {
1477                PyErr_Format(PyExc_RuntimeError,
1478                             "gpuarray error: %(k_var)s: %%s.",
1479                             GpuKernel_error(&%(k_var)s, err));
1480                %(fail)s;
1481            }
1482        """ % locals()
1483        # use threadIdx.x for i0
1484        # use blockIdx.x for i1
1485        # use blockIdx.y for i2
1486        print("""
1487        {
1488            int verbose = %(verbose)s;
1489            if (PyGpuArray_STRIDES(%(x)s)[2] != sizeof(%(in_dtype)s)){
1490                size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1};
1491                size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)4096), 1, 1};
1492                while (n_blocks[0] * (n_blocks[1]+1) <= 4096 &&
1493                       n_blocks[1] <= PyGpuArray_DIMS(%(x)s)[2])
1494                {
1495                    n_blocks[1] += 1;
1496                }
1497                %(makecall)s
1498            }
1499            else
1500            {   // reuse 010_AD kernel, we transpose the 2 first dim
1501                // See the reduction for the real 010_AD kernel for
1502                // explanation. We do this to get coalesced read.
1503                size_t n_threads[3] = {32, 1, 1};
1504
1505                size_t A = PyGpuArray_DIMS(%(x)s)[1];
1506                size_t B = PyGpuArray_DIMS(%(x)s)[0];
1507                size_t C = PyGpuArray_DIMS(%(x)s)[2];
1508                size_t D = C/32;
1509                if (32*D < C) D+= 1;
1510                assert ((C <= 32*D) && (32*D < C+32));
1511
1512                // The gridsize would ideally be (A, D).  But we do the following logic to make
1513                // sure we don't ask for a grid that is too big.
1514                size_t n_blocks[3] = {A, D, 1};
1515                if (n_blocks[0] > 4096) n_blocks[0] = 4096;
1516                if (n_blocks[0]*n_blocks[1] > 4096) n_blocks[1] = 4096/n_blocks[0];
1517                size_t n_shared = 0;
1518                ssize_t stride_A0 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
1519                ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
1520                ssize_t stride_A2 = PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s);
1521                ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
1522                ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1]/sizeof(%(out_dtype)s);
1523                void *kernel_params[] = {
1524                        (void *)&A, (void *)&B, (void *)&C, (void *)&D,
1525                        (void *)%(x)s->ga.data,
1526                        (void *)&%(x)s->ga.offset,
1527                        (void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2,
1528                        (void *)%(z)s->ga.data,
1529                        (void *)&%(z)s->ga.offset,
1530                        (void *)&stride_Z0, (void *)&stride_Z1};
1531                int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, n_threads, 0, kernel_params);
1532                %(err_check)s
1533            }
1534        }
1535        """ % locals(), file=sio)
1536
1537    def c_code_reduce_110(self, sio, node, name, x, z, fail):
1538        verbose = self.verbose
1539        makecall = self._makecall(node, name, x, z, fail)
1540        print("""
1541        {
1542            int verbose = %(verbose)s;
1543            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 256), 1, 1};
1544            while (n_threads[0]*n_threads[1] <= 256)
1545            {
1546                if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[0])
1547                    break;
1548                n_threads[1] += 1;
1549            }
1550            n_threads[1] -= 1;
1551
1552            size_t n_blocks[3] = {PyGpuArray_DIMS(%(x)s)[2], 1, 1};
1553            %(makecall)s
1554        }
1555        """ % locals(), file=sio)
1556
1557    def c_code_reduce_001(self, sio, node, name, x, z, fail):
1558        verbose = self.verbose
1559        makecall = self._makecall(node, name, x, z, fail)
1560        print("""
1561        {
1562            int verbose = %(verbose)s;
1563            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1};
1564            size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 4096), 1, 1};
1565            while (n_blocks[0] * n_blocks[1] <= 4096)
1566            {
1567                if (n_blocks[1] > PyGpuArray_DIMS(%(x)s)[1])
1568                    break;
1569                n_blocks[1] += 1;
1570            }
1571            n_blocks[1] -= 1;
1572            %(makecall)s
1573        }
1574        """ % locals(), file=sio)
1575
1576    def c_code_reduce_101(self, sio, node, name, x, z, fail):
1577        verbose = self.verbose
1578        makecall = self._makecall(node, name, x, z, fail,
1579                                  extra_dims=[("size_t one = 1;", "(void *) &one")],
1580                                  extra_strides=[("ssize_t sone = 1;", "(void *) &sone")],
1581                                  pattern="1011")
1582        print("""
1583        {
1584            int verbose = %(verbose)s;
1585//            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3],
1586//                                            (size_t) 256), 1, 1};
1587            size_t n_threads[3] = {1, 1, 1};
1588
1589            while (n_threads[0] * (n_threads[1]+1) <= 256) ++n_threads[1];
1590            if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[2])
1591                n_threads[1] = PyGpuArray_DIMS(%(x)s)[2];
1592
1593            while (n_threads[0] * n_threads[1] * (n_threads[2]+1) <= 256)
1594                ++n_threads[2];
1595            if (n_threads[2] > 64)
1596                n_threads[2] = 64;
1597            if (n_threads[2] > PyGpuArray_DIMS(%(x)s)[0])
1598                n_threads[2] = PyGpuArray_DIMS(%(x)s)[0];
1599
1600            size_t n_blocks[3] = {PyGpuArray_DIMS(%(x)s)[1], 1, 1};
1601            %(makecall)s
1602        }
1603        """ % locals(), file=sio)
1604
1605    def c_code_reduce_111(self, sio, node, name, x, z, fail):
1606        verbose = self.verbose
1607        makecall = self._makecall(node, name, x, z, fail)
1608        print("""
1609        {
1610            int verbose = %(verbose)s;
1611            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1};
1612
1613            //get as many y threads as we can fit
1614            while (n_threads[0] * n_threads[1] <= 256)
1615            {
1616                if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[1])
1617                    break;
1618                n_threads[1] += 1;
1619            }
1620            n_threads[1] -= 1;
1621
1622            //get as many z threads as we can fit
1623            while (n_threads[0] * n_threads[1] * n_threads[2] <= 256)
1624            {
1625                if (n_threads[2] > PyGpuArray_DIMS(%(x)s)[0])
1626                    break;
1627                n_threads[2] += 1;
1628            }
1629            n_threads[2] -= 1;
1630            //Maximum for Fermi GPU on that dimensions.
1631            n_threads[2] = std::min(n_threads[2], (size_t)64);
1632
1633            size_t n_blocks[3] = {1, 1, 1};
1634            %(makecall)s
1635        }
1636        """ % locals(), file=sio)
1637
1638    def c_code_reduce_0011(self, sio, node, name, x, z, fail):
1639        verbose = self.verbose
1640        makecall = self._makecall(node, name, x, z, fail)
1641        in_dtype = "npy_" + node.inputs[0].dtype
1642        out_dtype = "npy_" + node.outputs[0].dtype
1643        acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
1644        print("""
1645        {
1646            int verbose = %(verbose)s;
1647
1648            size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 4096), 1, 1};
1649
1650            while (n_blocks[0] * n_blocks[1] <= 4096 &&
1651                   n_blocks[1] < PyGpuArray_DIMS(%(x)s)[1])
1652            {
1653                n_blocks[1] += 1;
1654            }
1655
1656            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3], (size_t) 256), 1, 1};
1657            while (n_threads[0] * n_threads[1] <= 256
1658                   && n_threads[1] < PyGpuArray_DIMS(%(x)s)[2]
1659                   && n_threads[0] * n_threads[1] * sizeof(%(acc_dtype)s) <=(15*1024-200))
1660            {
1661                n_threads[1] += 1;
1662            }
1663
1664            %(makecall)s
1665        }
1666        """ % locals(), file=sio)
1667
1668    def c_code_reduce_1111(self, sio, node, name, x, z, fail):
1669        verbose = self.verbose
1670        makecall = self._makecall(node, name, x, z, fail)
1671        print("""
1672        {
1673            int verbose = %(verbose)s;
1674            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1};
1675
1676            //get as many y threads as we can fit
1677            while (n_threads[0] * n_threads[1] <= 256)
1678            {
1679                if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[1])
1680                    break;
1681                n_threads[1] += 1;
1682            }
1683            n_threads[1] -= 1;
1684
1685            //get as many z threads as we can fit
1686            while (n_threads[0] * n_threads[1] * n_threads[2] <= 256)
1687            {
1688                if (n_threads[2] > PyGpuArray_DIMS(%(x)s)[0])
1689                    break;
1690                n_threads[2] += 1;
1691            }
1692            n_threads[2] -= 1;
1693
1694            //Maximum for Fermi GPU on that dimensions.
1695            n_threads[2] = std::min(n_threads[2], (size_t)64);
1696
1697            size_t n_blocks[3] = {1, 1, 1};
1698            %(makecall)s
1699        }
1700        """ % locals(), file=sio)
1701
1702    def c_code_reduce_1011(self, sio, node, name, x, z, fail):
1703        verbose = self.verbose
1704        makecall = self._makecall(node, name, x, z, fail)
1705        print("""
1706        {
1707            int verbose = %(verbose)s;
1708            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3], (size_t) 256), 1, 1};
1709
1710            while (n_threads[0] * (n_threads[1]+1) <= 256) ++n_threads[1];
1711            if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[2])
1712                n_threads[1] = PyGpuArray_DIMS(%(x)s)[2];
1713
1714            while (n_threads[0] * n_threads[1] * (n_threads[2]+1) <= 256) ++n_threads[2];
1715            if (n_threads[2] > 64)
1716                n_threads[2] = 64;
1717            if (n_threads[2] > PyGpuArray_DIMS(%(x)s)[0])
1718                n_threads[2] = PyGpuArray_DIMS(%(x)s)[0];
1719
1720            size_t n_blocks[3] = {PyGpuArray_DIMS(%(x)s)[1], 1, 1};
1721            %(makecall)s
1722        }
1723        """ % locals(), file=sio)
1724
1725    def c_code_cache_version_apply(self, node):
1726        version = [24, self.verbose]  # the version corresponding to the c code in this Op
1727
1728        # now we insert versions for the ops on which we depend...
1729        scalar_node = Apply(
1730            self.scalar_op,
1731            [Scalar(dtype=input.type.dtype)() for input in node.inputs],
1732            [Scalar(dtype=output.type.dtype)() for output in node.outputs])
1733        version.extend(self.scalar_op.c_code_cache_version_apply(scalar_node))
1734        for i in node.inputs + node.outputs:
1735            version.extend(Scalar(dtype=i.type.dtype).c_code_cache_version())
1736        version.extend(self.kernel_version(node))
1737        if all(version):
1738            return tuple(version)
1739        else:
1740            return ()
1741
1742    def gpu_kernels(self, node, nodename):
1743        nd_in = len(self.reduce_mask)
1744        in_dtype = node.inputs[0].dtype
1745        out_dtype = node.outputs[0].dtype
1746        acc_dtype = self._acc_dtype(node.inputs[0].dtype)
1747        assign_dtype = in_dtype
1748        flags = Kernel.get_flags(in_dtype, acc_dtype, out_dtype)
1749        in_type = gpuarray.dtype_to_ctype(in_dtype)
1750        out_type = gpuarray.dtype_to_ctype(out_dtype)
1751        acc_type = gpuarray.dtype_to_ctype(acc_dtype)
1752        load_in = load_w(in_dtype)
1753        write_out = write_w(out_dtype)
1754        kernels = []
1755
1756        if all(i == 1 for i in self.reduce_mask):
1757            # this kernel is ok for up to a few thousand elements, but
1758            # it only runs on ONE multiprocessor
1759            reducebuf = self._k_reduce_buf('Z[0]', node, nodename, sub={})
1760            reduce_fct = self._assign_reduce(node, nodename, "myresult",
1761                                             load_in + "(A[i0])",
1762                                             {}, True)
1763            reduce_init = self._assign_init(load_in + "(A[0])", assign_dtype)
1764            kname = "kernel_reduce_ccontig"
1765            k_var = "kernel_reduce_ccontig_" + nodename
1766            sio = StringIO()
1767            print("""#include "cluda.h"
1768
1769            KERNEL void %(kname)s(
1770                    const ga_size d0,
1771                    const %(in_type)s *A, const ga_size offset_A,
1772                    %(out_type)s *Z, const ga_size offset_Z)
1773            {
1774                const int threadCount = blockDim.x;
1775                const int threadNum = threadIdx.x;
1776                extern __shared__ %(acc_type)s buf[];
1777                A = (const %(in_type)s *)(((char *)A)+offset_A);
1778                Z = (%(out_type)s *)(((char *)Z)+offset_Z);
1779                %(acc_type)s myresult = %(reduce_init)s;
1780
1781                for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
1782                {
1783                    %(reduce_fct)s
1784                }
1785                %(reducebuf)s
1786            }
1787            """ % locals(), file=sio)
1788            params = [
1789                'uintp',
1790                gpuarray.GpuArray, 'uintp',
1791                gpuarray.GpuArray, 'uintp'
1792                ]
1793            kernels.append(Kernel(code=sio.getvalue(), name=kname,
1794                                  params=params, flags=flags, objvar=k_var))
1795        if self.reduce_mask == (1,):
1796            # this kernel is ok for up to a few thousand elements, but
1797            # it only runs on ONE multiprocessor
1798            reducebuf = self._k_reduce_buf('Z[0]', node, nodename, sub={})
1799            reduce_fct = self._assign_reduce(node, nodename, "myresult",
1800                                             load_in + "(A[i0 * sA0])",
1801                                             {}, True)
1802            reduce_init = self._assign_init(load_in + "(A[0])", assign_dtype)
1803            kname = "kernel_reduce_1"
1804            k_var = "kernel_reduce_1_" + nodename
1805            sio = StringIO()
1806            print("""#include "cluda.h"
1807
1808            KERNEL void %(kname)s(
1809                    const ga_size d0,
1810                    const %(in_type)s *A, const ga_size offset_A,
1811                    const ga_ssize sA0,
1812                    %(out_type)s * Z, const ga_size offset_Z)
1813            {
1814                const int threadCount = blockDim.x;
1815                const int threadNum = threadIdx.x;
1816                extern __shared__ %(acc_type)s buf[];
1817                A = (const %(in_type)s *)(((char *)A)+offset_A);
1818                Z = (%(out_type)s *)(((char *)Z)+offset_Z);
1819                %(acc_type)s myresult = %(reduce_init)s;
1820
1821                for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
1822                {
1823                    %(reduce_fct)s
1824                }
1825                %(reducebuf)s
1826            }
1827            """ % locals(), file=sio)
1828            params = [
1829                'uintp',
1830                gpuarray.GpuArray, 'uintp',
1831                'intp',
1832                gpuarray.GpuArray, 'uintp'
1833                ]
1834            kernels.append(Kernel(code=sio.getvalue(), name=kname,
1835                                  params=params, flags=flags, objvar=k_var))
1836        if self.reduce_mask == (1, 1):
1837            # this kernel is ok for up to a few thousand elements, but
1838            # it only runs on ONE multiprocessor
1839            reducebuf = self._k_reduce_buf('Z[0]', node, nodename, sub={})
1840            reduce_fct = self._assign_reduce(node, nodename, "myresult",
1841                                             load_in + "(A[i0 * sA0 + i1 * sA1])",
1842                                             {}, True)
1843            reduce_init = self._assign_init(load_in + "(A[0])", assign_dtype)
1844            kname = "kernel_reduce_11"
1845            k_var = "kernel_reduce_11_" + nodename
1846            sio = StringIO()
1847            print("""#include "cluda.h"
1848
1849            KERNEL void %(kname)s(
1850                    const ga_size d0, const ga_size d1,
1851                    const %(in_type)s *A, const ga_size offset_A,
1852                    const ga_ssize sA0, const ga_ssize sA1,
1853                    %(out_type)s * Z, const ga_size offset_Z)
1854            {
1855                const int threadCount = blockDim.x * blockDim.y;
1856                const int threadNum = threadIdx.y*blockDim.x + threadIdx.x;
1857                extern __shared__ %(acc_type)s buf[];
1858                A = (const %(in_type)s *)(((char *)A)+offset_A);
1859                Z = (%(out_type)s *)(((char *)Z)+offset_Z);
1860                %(acc_type)s myresult = %(reduce_init)s;
1861
1862                for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y)
1863                {
1864                    for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
1865                    {
1866                        %(reduce_fct)s;
1867                    }
1868                }
1869                %(reducebuf)s
1870            }
1871            """ % locals(), file=sio)
1872            params = [
1873                'uintp', 'uintp',
1874                gpuarray.GpuArray, 'uintp',
1875                'intp', 'intp',
1876                gpuarray.GpuArray, 'uintp'
1877                ]
1878            kernels.append(Kernel(code=sio.getvalue(), name=kname,
1879                                  params=params, flags=flags, objvar=k_var))
1880        # 01, 011, 0111
1881        if (0 == self.reduce_mask[0] and
1882                all(self.reduce_mask[1:]) and
1883                nd_in in[2, 3, 4]):
1884            # this kernel uses one block for each row.
1885            # threads per block for each element per row.
1886
1887            N_pattern = ''.join(['1'] * (nd_in - 1))
1888            # TODO: is it faster to hardcode sA3, etc. in the later
1889            # code, rather than have the for_* variables declare them
1890            # and the later code use their names?
1891            if nd_in == 2:
1892                for_i1 = "for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)"
1893                first_i1 = 'threadIdx.x'
1894                sA1 = 'sA1'
1895                for_i2 = "int i2=0, sA2=0;"
1896                sA2 = '0'
1897                first_i2 = '0'
1898                for_i3 = "int i3=0, sA3=0;"
1899                sA3 = '0'
1900                first_i3 = '0'
1901            if nd_in == 3:
1902                for_i1 = "for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)"
1903                first_i1 = 'threadIdx.y'
1904                sA1 = 'sA1'
1905                for_i2 = "for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)"
1906                first_i2 = 'threadIdx.x'
1907                sA2 = 'sA2'
1908                for_i3 = "int i3=0, sA3=0;"
1909                first_i3 = 0
1910                sA3 = '0'
1911            if nd_in == 4:
1912                for_i1 = "for (int i1 = threadIdx.z; i1 < d1; i1 += blockDim.z)"
1913                first_i1 = 'threadIdx.z'
1914                sA1 = 'sA1'
1915                for_i2 = "for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)"
1916                first_i2 = 'threadIdx.y'
1917                sA2 = 'sA2'
1918                for_i3 = "for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)"
1919                first_i3 = 'threadIdx.x'
1920                sA3 = 'sA3'
1921
1922            reducebuf = self._k_reduce_buf('Z[i0 * sZ0]', node,
1923                                           nodename, sub={})
1924            param_dim = ",".join(["const ga_size d%d" % i
1925                                  for i in xrange(nd_in)])
1926            param_strides = ",".join(["const ga_ssize sA%d" % i
1927                                      for i in xrange(nd_in)])
1928            decl, kname, params, k_var = self._k_decl(node, nodename)
1929            init = self._k_init(node, nodename)
1930            reduce_init = self._assign_init(load_in + "(A[%(first_i3)s * %(sA3)s + %(first_i2)s * %(sA2)s + %(first_i1)s * %(sA1)s + i0 * sA0])" % locals(), assign_dtype)
1931            reduce_fct = self._assign_reduce(
1932                node, nodename, "myresult",
1933                load_in + "(A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0])",
1934                {}, True)
1935            sio = StringIO()
1936            print("""#include "cluda.h"
1937
1938                %(decl)s{
1939                    %(init)s
1940                    for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
1941                      myresult = %(reduce_init)s;
1942                      %(for_i1)s{
1943                        %(for_i2)s{
1944                          %(for_i3)s{
1945                            %(reduce_fct)s;
1946                          }
1947                        }
1948                      }
1949                      %(reducebuf)s
1950                    }
1951                }
1952                """ % locals(), file=sio)
1953            kernels.append(Kernel(code=sio.getvalue(), name=kname,
1954                                  params=params, flags=flags, objvar=k_var))
1955        if self.reduce_mask == (0, 1, 0) or self.reduce_mask == (1, 0):
1956            # this kernel uses one block for each column,
1957            # threads per block for each element per column.
1958
1959            # TODO: This kernel is pretty inefficient in terms of reading, because if A is
1960            #      c_contiguous (typical case) then each warp is accessing non-contigous
1961            #      memory (a segment of a column).
1962            reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2*sZ1]',
1963                                           node, nodename, sub={})
1964            reduce_fct = self._assign_reduce(node, nodename, "myresult",
1965                                             load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
1966                                             {}, True)
1967            reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + threadIdx.x * sA1 + i2 * sA2])", assign_dtype)
1968            kname = "kernel_reduce_010"
1969            k_var = "kernel_reduce_010_" + nodename
1970            sio = StringIO()
1971            print("""#include "cluda.h"
1972
1973            KERNEL void %(kname)s(
1974                    const ga_size d0, const ga_size d1, const ga_size d2,
1975                    const %(in_type)s *A, const ga_size offset_A,
1976                    const ga_ssize sA0, const ga_ssize sA1, const ga_ssize sA2,
1977                    %(out_type)s * Z, const ga_size offset_Z,
1978                    const ga_ssize sZ0, const ga_ssize sZ1)
1979            {
1980                const int threadCount = blockDim.x;
1981                const int threadNum = threadIdx.x;
1982                extern __shared__ %(acc_type)s buf[];
1983                A = (const %(in_type)s *)(((char *)A)+offset_A);
1984                Z = (%(out_type)s *)(((char *)Z)+offset_Z);
1985
1986                for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
1987                {
1988                    for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
1989                    {
1990                        %(acc_type)s myresult = %(reduce_init)s;
1991                        for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
1992                        {
1993                            %(reduce_fct)s;
1994                        }
1995                        %(reducebuf)s
1996                    }
1997                }
1998
1999            }
2000            """ % locals(), file=sio)
2001            params = [
2002                'uintp', 'uintp', 'uintp',
2003                gpuarray.GpuArray, 'uintp',
2004                'intp', 'intp', 'intp',
2005                gpuarray.GpuArray, 'uintp',
2006                'intp', 'intp'
2007                ]
2008            kernels.append(Kernel(code=sio.getvalue(), name=kname,
2009                                  params=params, flags=flags, objvar=k_var))
2010        if self.reduce_mask in [(0, 1, 0), (1, 0), (1, 0, 0)]:
2011            reduce_fct = self._assign_reduce(node, nodename, "myresult",
2012                                             load_in + "(X[a * sX0 + b * sX1 + c * sX2])",
2013                                             {}, True)
2014            reduce_init = self._assign_init(load_in + "(X[a * sX0 + 0 * sX1 + c * sX2])", assign_dtype)
2015            kname = "kernel_reduce_010_AD"
2016            k_var = "kernel_reduce_010_AD_" + nodename
2017            sio = StringIO()
2018            print("""#include "cluda.h"
2019
2020            KERNEL void %(kname)s(
2021                    const ga_size A, const ga_size B, const ga_size C, const ga_size D,
2022                    const %(in_type)s *X, const ga_size offset_X,
2023                    const ga_ssize sX0, const ga_ssize sX1, const ga_ssize sX2,
2024                    %(out_type)s * Z, const ga_size offset_Z,
2025                    const ga_ssize sZ0, const ga_ssize sZ1)
2026            {
2027                const int threadCount = blockDim.x;
2028                const int threadNum = threadIdx.x;
2029                X = (const %(in_type)s *)(((char *)X)+offset_X);
2030                Z = (%(out_type)s *)(((char *)Z)+offset_Z);
2031                %(acc_type)s myresult = 0;
2032
2033                for (int a = blockIdx.x; a < A; a += gridDim.x)
2034                {
2035                    for (int i2_D = blockIdx.y; i2_D < D; i2_D += gridDim.y)
2036                    {
2037                        int c = i2_D * 32 + threadIdx.x;
2038                        if (c < C)
2039                        {
2040                            myresult = %(reduce_init)s;
2041                            for (int b = 0; b < B; ++b)
2042                            {
2043                                %(reduce_fct)s;
2044                            }
2045                            Z[a * sZ0 + c * sZ1] = %(write_out)s(myresult);
2046                        }
2047                    }
2048                }
2049
2050            }
2051            """ % locals(), file=sio)
2052            params = [
2053                'uintp', 'uintp', 'uintp', 'uintp',
2054                gpuarray.GpuArray, 'uintp',
2055                'intp', 'intp', 'intp',
2056                gpuarray.GpuArray, 'uintp',
2057                'intp', 'intp'
2058                ]
2059            kernels.append(Kernel(code=sio.getvalue(), name=kname,
2060                                  params=params, flags=flags, objvar=k_var))
2061        if self.reduce_mask == (0, 1, 0):
2062            #
2063            # This kernel is optimized when the inner most dimensions
2064            # have the smallest stride.
2065
2066            # this kernel uses one block for multiple column(up to 32TODO),
2067            # threads per block for each element per column.
2068
2069            # thread.x = dim 2 contiguous
2070            # thread.y = dim 1
2071            # block.x = dim 0
2072            # block.y = dim 1 rest
2073            init = self._k_init(node, nodename)
2074            decl, kname, params, k_var = self._k_decl(node, nodename, pattern="010_inner")
2075            reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]',
2076                                                    node, nodename,
2077                                                    'blockDim.x')
2078            reduce_fct = self._assign_reduce(node, nodename, "myresult",
2079                                             load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
2080                                             {}, True)
2081            reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + 0 * sA1 + i2 * sA2])", assign_dtype)
2082            sio = StringIO()
2083            print("""#include "cluda.h"
2084
2085            %(decl)s
2086            {
2087              %(init)s
2088              for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
2089              {
2090                for (int i2 = blockIdx.y*blockDim.x+threadIdx.x; i2 < d2; i2 += gridDim.y*blockDim.x)
2091                 {
2092                  myresult = %(reduce_init)s;
2093                  for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)
2094                  {
2095                      %(reduce_fct)s;
2096                  }
2097                  %(reducebuf)s
2098                 }
2099              }
2100            }
2101            """ % locals(), file=sio)
2102            kernels.append(Kernel(code=sio.getvalue(), name=kname,
2103                                  params=params, flags=flags, objvar=k_var))
2104        if self.reduce_mask == (1, 1, 0):
2105            # this kernel uses one block for each column,
2106            # threads per block for each element per column.
2107
2108            # TODO: This kernel is pretty inefficient in terms of reading, because if A is
2109            #      c_contiguous (typical case) then each warp is accessing non-contigous
2110            #      memory (a segment of a column).
2111            reducebuf = self._k_reduce_buf('Z[blockIdx.x * sZ0]', node, nodename, sub={})
2112            reduce_fct = self._assign_reduce(node, nodename, "myresult",
2113                                             load_in + "(A[i0 * sA0 + i1 * sA1 + blockIdx.x * sA2])",
2114                                             {}, True)
2115            reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA2])", assign_dtype)
2116            kname = "kernel_reduce_110"
2117            k_var = "kernel_reduce_110_" + nodename
2118            sio = StringIO()
2119            print("""#include "cluda.h"
2120
2121            KERNEL void %(kname)s(
2122                    const ga_size d0, const ga_size d1, const ga_size d2,
2123                    const %(in_type)s *A, const ga_size offset_A,
2124                    const ga_ssize sA0, const ga_ssize sA1, const ga_ssize sA2,
2125                    %(out_type)s * Z, const ga_size offset_Z,
2126                    const ga_ssize sZ0)
2127            {
2128                const int threadCount = blockDim.x * blockDim.y;
2129                const int threadNum = threadIdx.y * blockDim.x + threadIdx.x;
2130                extern __shared__ %(acc_type)s buf[];
2131                A = (const %(in_type)s *)(((char *)A)+offset_A);
2132                Z = (%(out_type)s *)(((char *)Z)+offset_Z);
2133                %(acc_type)s myresult = %(reduce_init)s;
2134
2135                for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y)
2136                {
2137                    for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
2138                    {
2139                        %(reduce_fct)s;
2140                    }
2141                }
2142
2143                %(reducebuf)s
2144            }
2145            """ % locals(), file=sio)
2146            params = [
2147                'uintp', 'uintp', 'uintp',
2148                gpuarray.GpuArray, 'uintp',
2149                'intp', 'intp', 'intp',
2150                gpuarray.GpuArray, 'uintp',
2151                'intp'
2152                ]
2153            kernels.append(Kernel(code=sio.getvalue(), name=kname,
2154                                  params=params, flags=flags, objvar=k_var))
2155        if self.reduce_mask == (1, 0, 0):
2156            reducebuf = self._k_reduce_buf('Z[i1 * sZ0 + i2 * sZ1]',
2157                                           node, nodename, sub={})
2158            decl, kname, params, k_var = self._k_decl(node, nodename)
2159            init = self._k_init(node, nodename)
2160            reduce_fct = self._assign_reduce(node, nodename, "myresult",
2161                                             load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
2162                                             {}, True)
2163            reduce_init = self._assign_init(load_in + "(A[i1 * sA1 + i2 * sA2])", assign_dtype)
2164            sio = StringIO()
2165            print("""#include "cluda.h"
2166
2167            %(decl)s
2168            {
2169                %(init)s
2170                for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
2171                {
2172                    for (int i1 = blockIdx.x; i1 < d1; i1 += gridDim.x)
2173                    {
2174                        myresult = %(reduce_init)s;
2175                        for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
2176                        {
2177                            %(reduce_fct)s
2178                        }
2179                        %(reducebuf)s
2180                    }
2181                }
2182            }
2183            """ % locals(), file=sio)
2184            kernels.append(Kernel(code=sio.getvalue(), name=kname,
2185                                  params=params, flags=flags, objvar=k_var))
2186        if self.reduce_mask == (1, 1, 1):
2187            reducebuf = self._k_reduce_buf('Z[0]', node,
2188                                           nodename, sub={})
2189            decl, kname, params, k_var = self._k_decl(node, nodename)
2190            init = self._k_init(node, nodename)
2191            reduce_fct = self._assign_reduce(node, nodename, "myresult",
2192                                             load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
2193                                             {}, True)
2194            reduce_init = self._assign_init(load_in + "(A[0])", assign_dtype)
2195            sio = StringIO()
2196            print("""#include "cluda.h"
2197
2198            %(decl)s
2199            {
2200                %(init)s
2201                myresult = %(reduce_init)s;
2202                for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z)
2203                {
2204                    for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)
2205                    {
2206                        for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)
2207                        {
2208                            %(reduce_fct)s;
2209                        }
2210                    }
2211                }
2212                %(reducebuf)s
2213            }
2214            """ % locals(), file=sio)
2215            kernels.append(Kernel(code=sio.getvalue(), name=kname,
2216                                  params=params, flags=flags, objvar=k_var))
2217        if self.reduce_mask == (0, 0, 1):
2218            # this kernel uses one block for each row,
2219            # threads per block for each element per row.
2220            reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]',
2221                                           node, nodename, sub={})
2222            reduce_fct = self._assign_reduce(node, nodename, "myresult",
2223                                             load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
2224                                             {}, True)
2225            reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])", assign_dtype)
2226            kname = "kernel_reduce_001"
2227            k_var = "kernel_reduce_001_" + nodename
2228            sio = StringIO()
2229            print("""#include "cluda.h"
2230            KERNEL void %(kname)s(
2231                    const ga_size d0, const ga_size d1, const ga_size d2,
2232                    const %(in_type)s *A, const ga_size offset_A,
2233                    const ga_ssize sA0, const ga_ssize sA1, const ga_ssize sA2,
2234                    %(out_type)s * Z, const ga_size offset_Z,
2235                    const ga_ssize sZ0, const ga_ssize sZ1)
2236            {
2237                const int threadCount = blockDim.x;
2238                const int threadNum = threadIdx.x;
2239                extern __shared__ %(acc_type)s buf[];
2240                A = (const %(in_type)s *)(((char *)A)+offset_A);
2241                Z = (%(out_type)s *)(((char *)Z)+offset_Z);
2242
2243                for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
2244                {
2245                    for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
2246                    {
2247                        %(acc_type)s myresult = %(reduce_init)s;
2248                        for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)
2249                        {
2250                            %(reduce_fct)s;
2251                        }
2252                        %(reducebuf)s
2253                    }
2254                }
2255            }
2256            """ % locals(), file=sio)
2257            params = [
2258                'uintp', 'uintp', 'uintp',
2259                gpuarray.GpuArray, 'uintp',
2260                'intp', 'intp', 'intp',
2261                gpuarray.GpuArray, 'uintp',
2262                'intp', 'intp'
2263                ]
2264            kernels.append(Kernel(code=sio.getvalue(), name=kname,
2265                                  params=params, flags=flags, objvar=k_var))
2266        if self.reduce_mask == (0, 0, 1, 1):
2267            # this kernel uses one block for each row,
2268            # threads per block for each element per row.
2269            reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]',
2270                                           node, nodename, sub={})
2271            decl, kname, params, k_var = self._k_decl(node, nodename)
2272            init = self._k_init(node, nodename)
2273            reduce_fct = self._assign_reduce(node, nodename, "myresult",
2274                                             load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])",
2275                                             {}, True)
2276            reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])", assign_dtype)
2277            sio = StringIO()
2278            print("""#include "cluda.h"
2279
2280            %(decl)s
2281            {
2282                %(init)s
2283
2284                for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
2285                {
2286                    for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
2287                    {
2288                        %(acc_type)s myresult = %(reduce_init)s;
2289                    for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
2290                    {
2291                        for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
2292                        {
2293                            %(reduce_fct)s;
2294                        }
2295                    }
2296                        %(reducebuf)s
2297                    }
2298                }
2299            }
2300            """ % locals(), file=sio)
2301            kernels.append(Kernel(code=sio.getvalue(), name=kname,
2302                                  params=params, flags=flags, objvar=k_var))
2303        if self.reduce_mask == (0, 1, 0, 1):
2304            # this kernel uses one block for each row,
2305            # threads per block for each element per row.
2306            reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2 * sZ1]',
2307                                           node, nodename, sub={})
2308            decl, kname, params, k_var = self._k_decl(node, nodename)
2309            init = self._k_init(node, nodename)
2310            reduce_fct = self._assign_reduce(node, nodename, "myresult",
2311                                             load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])",
2312                                             {}, True)
2313            reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i2 * sA2])", assign_dtype)
2314            sio = StringIO()
2315            print("""#include "cluda.h"
2316
2317            %(decl)s
2318            {
2319                %(init)s
2320
2321                for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
2322                {
2323                    for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
2324                    {
2325                        %(acc_type)s myresult = %(reduce_init)s;
2326                    for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)
2327                    {
2328                        for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
2329                        {
2330                            %(reduce_fct)s;
2331                        }
2332                    }
2333                        %(reducebuf)s
2334                    }
2335                }
2336            }
2337            """ % locals(), file=sio)
2338            kernels.append(Kernel(code=sio.getvalue(), name=kname,
2339                                  params=params, flags=flags, objvar=k_var))
2340        if self.reduce_mask == (1, 1, 1, 1):
2341            reducebuf = self._k_reduce_buf('Z[0]', node, nodename,
2342                                           sub={})
2343            decl, kname, params, k_var = self._k_decl(node, nodename)
2344            init = self._k_init(node, nodename)
2345            reduce_fct = self._assign_reduce(node, nodename, "myresult",
2346                                             load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])",
2347                                             {}, True)
2348            reduce_init = self._assign_init(load_in + "(A[0])", assign_dtype)
2349            sio = StringIO()
2350            print("""#include "cluda.h"
2351
2352            %(decl)s
2353            {
2354                %(init)s
2355                myresult = %(reduce_init)s;
2356              for (int i0 = 0; i0 < d0; i0++)
2357                for (int i1 = threadIdx.z; i1 < d1; i1 += blockDim.z)
2358                {
2359                    for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
2360                    {
2361                        for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
2362                        {
2363                            %(reduce_fct)s;
2364                        }
2365                    }
2366                }
2367                %(reducebuf)s
2368            }
2369            """ % locals(), file=sio)
2370            kernels.append(Kernel(code=sio.getvalue(), name=kname,
2371                                  params=params, flags=flags, objvar=k_var))
2372        if self.reduce_mask == (1, 0, 1, 1) or self.reduce_mask == (1, 0, 1):
2373            reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]',
2374                                           node, nodename, sub={})
2375            reduce_fct = self._assign_reduce(node, nodename, "myresult",
2376                                             load_in + "(A[i0 * sA0 + blockIdx.x * sA1 + i2 * sA2 + i3 * sA3])",
2377                                             {}, True)
2378            reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA1])", assign_dtype)
2379            kname = "kernel_reduce_1011"
2380            k_var = "kernel_reduce_1011_" + nodename
2381            sio = StringIO()
2382            print("""#include "cluda.h"
2383
2384            KERNEL void %(kname)s(
2385                    const ga_size d0, const ga_size d1, const ga_size d2, const ga_size d3,
2386                    const %(in_type)s *A, const ga_size offset_A,
2387                    const ga_ssize sA0, const ga_ssize sA1, const ga_ssize sA2, const ga_ssize sA3,
2388                    %(out_type)s * Z, const ga_size offset_Z,
2389                    const ga_ssize sZ0)
2390            {
2391                const int threadCount = blockDim.x * blockDim.y * blockDim.z;
2392                const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
2393                extern __shared__ %(acc_type)s buf[];
2394                A = (const %(in_type)s *)(((char *)A)+offset_A);
2395                Z = (%(out_type)s *)(((char *)Z)+offset_Z);
2396                %(acc_type)s myresult = %(reduce_init)s;
2397
2398                for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z)
2399                {
2400                    for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
2401                    {
2402                        for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
2403                        {
2404                            %(reduce_fct)s;
2405                        }
2406                    }
2407                }
2408                %(reducebuf)s
2409            }
2410            """ % locals(), file=sio)
2411            params = [
2412                'uintp', 'uintp', 'uintp', 'uintp',
2413                gpuarray.GpuArray, 'uintp',
2414                'intp', 'intp', 'intp', 'intp',
2415                gpuarray.GpuArray, 'uintp',
2416                'intp'
2417                ]
2418            kernels.append(Kernel(code=sio.getvalue(), name=kname,
2419                                  params=params, flags=flags, objvar=k_var))
2420        return kernels
2421
2422
2423class GpuErfinv(Erfinv):
2424    """
2425    Inverse error function for GPU.
2426
2427    """
2428
2429    def c_headers(self):
2430        return ['math_functions.h', 'cublas_v2.h']
2431
2432    def c_code(self, node, name, inp, out, sub):
2433        x, = inp
2434        z, = out
2435        if node.inputs[0].type in complex_types:
2436            raise NotImplementedError('type not supported', type)
2437        # NB: CUDA erfinv function (GPU op) returns NaN if x not in [-1;1],
2438        # while `scipy.special.erfinv` (CPU op) returns an infinite (-inf if x < -1, +inf if x > 1).
2439        # For consistency of CPU and GPU ops, we wrap the CUDA erfinv in the following conditions
2440        # to ensure that GPU op returns the same values as CPU op.
2441        return "%(z)s = (%(x)s <= -1) ? erfinv(-1.0): ((%(x)s >= 1) ? erfinv(1.0): erfinv(%(x)s));" % locals()
2442gpu_erfinv = GpuErfinv(upgrade_to_float_no_complex, name='gpu_erfinv')
2443
2444
2445class GpuErfcinv(Erfcinv):
2446    """
2447    Inverse complementary error function for GPU.
2448
2449    """
2450
2451    def c_headers(self):
2452        return ['math_functions.h', 'cublas_v2.h']
2453
2454    def c_code(self, node, name, inp, out, sub):
2455        x, = inp
2456        z, = out
2457        if node.inputs[0].type in complex_types:
2458            raise NotImplementedError('type not supported', type)
2459        # NB: CUDA erfcinv function (GPU op) returns NaN if x not in [0;2],
2460        # while `scipy.special.erfcinv` (CPU op) returns an infinite (+inf if x < 0, -inf if x > 2).
2461        # For consistency of CPU and GPU ops, we wrap the CUDA erfcinv in the following conditions
2462        # to ensure that GPU op returns the same values as CPU op.
2463        return "%(z)s = (%(x)s <= 0) ? erfcinv(0.0): ((%(x)s >= 2) ? erfcinv(2.0): erfcinv(%(x)s));" % locals()
2464gpu_erfcinv = GpuErfcinv(upgrade_to_float_no_complex, name='gpu_erfcinv')
2465
2466
2467# Caching GpuCAReduceCuda
2468def gpu_ca_reduce_cuda(scalar_op, axis=None, reduce_mask=None, dtype=None, acc_dtype=None,
2469                       pre_scalar_op=None):
2470    key = (scalar_op, axis, reduce_mask, dtype, acc_dtype,
2471           pre_scalar_op)
2472    if key not in gpu_ca_reduce_cuda.cache:
2473        gpu_ca_reduce_cuda.cache[key] = GpuCAReduceCuda(scalar_op, axis, reduce_mask, dtype,
2474                                                        acc_dtype, pre_scalar_op)
2475    return gpu_ca_reduce_cuda.cache[key]
2476gpu_ca_reduce_cuda.cache = {}
2477
2478
2479class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
2480    """
2481    CAReduce that reuse the python code from gpuarray.
2482
2483    """
2484    def __init__(self, scalar_op, axis=None, dtype=None, acc_dtype=None):
2485        if not hasattr(scalar_op, 'identity'):
2486            raise ValueError("No identity on scalar op")
2487        CAReduceDtype.__init__(self, scalar_op, axis=axis, dtype=dtype,
2488                               acc_dtype=acc_dtype)
2489
2490    def __str__(self):
2491        ax = ''
2492        if self.axis is not None:
2493            ax = '{%s}' % (', '.join(str(x) for x in self.axis),)
2494        return "GpuReduce{%s}%s" % (self.scalar_op, ax)
2495
2496    def make_node(self, input):
2497        ctx_name = infer_context_name(input)
2498        res = CAReduceDtype.make_node(self, input)
2499        input = as_gpuarray_variable(input, ctx_name)
2500        otype = GpuArrayType(dtype=res.outputs[0].dtype,
2501                             broadcastable=res.outputs[0].broadcastable,
2502                             context_name=ctx_name)
2503
2504        if res.op.axis is not None:
2505            redux = []
2506            for i in range(len(input.type.broadcastable)):
2507                redux.append(i in res.op.axis)
2508                # since redux is just another way to describe what is in axis
2509                # it doesn't need to be compared in __eq__ or __hash__
2510            res.op.redux = redux
2511
2512        return Apply(res.op, [input], [otype()])
2513
2514    def get_params(self, node):
2515        return node.outputs[0].type.context
2516
2517    def prepare_node(self, node, storage_map, compute_map, impl):
2518        # cache the kernel object
2519        self.get_kernel_cache(node)
2520
2521    def get_kernel_cache(self, node):
2522        attr = '@cache_reduction_k'
2523        if self.axis is None:
2524            redux = [True] * node.inputs[0].ndim
2525        else:
2526            redux = self.redux
2527        if not hasattr(node, attr):
2528            acc_dtype = getattr(self, 'acc_dtype', None)
2529            if acc_dtype is None:
2530                acc_dtype = node.outputs[0].type.dtype
2531            if any(redux):
2532                setattr(node, attr, self.generate_kernel(node, acc_dtype,
2533                                                         redux))
2534
2535        if any(redux):
2536            return getattr(node, attr)
2537
2538    def gpu_kernels(self, node, name):
2539        if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
2540            # Some OpenCL compilers do not accept no-arguments empty kernels
2541            src = "#include \"cluda.h\"\nKERNEL void reduk(GLOBAL_MEM float *a) { a[0] = 0; }"
2542            params = ['float32']
2543        else:
2544            k = self.get_kernel_cache(node)
2545            _, src, _, _ = k._get_basic_kernel(k.init_local_size,
2546                                               node.inputs[0].ndim)
2547            nd = node.inputs[0].ndim
2548            params = ['uint32', gpuarray.GpuArray, 'uint32']
2549            params.extend('uint32' for _ in range(nd))
2550            params.append(gpuarray.GpuArray)
2551            params.append('uint32')
2552            params.extend('int32' for _ in range(nd))
2553        acc_dtype = getattr(self, 'acc_dtype', None)
2554        if acc_dtype is None:
2555            acc_dtype = node.outputs[0].type.dtype
2556        return [Kernel(code=src, name="reduk", params=params,
2557                       flags=Kernel.get_flags(node.inputs[0].type.dtype,
2558                                              acc_dtype,
2559                                              node.outputs[0].type.dtype),
2560                       objvar='k_reduk_' + name)]
2561
2562    def c_code(self, node, name, inp, out, sub):
2563        if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
2564            # We special case the no-reduction case since the gpu
2565            # kernel has trouble handling it.
2566            return """
2567        Py_XDECREF(%(out)s);
2568        %(out)s = pygpu_copy(%(inp)s, GA_ANY_ORDER);
2569        if (!%(out)s) {
2570            %(fail)s
2571        }
2572
2573        """ % dict(out=out[0], inp=inp[0], fail=sub['fail'])
2574        k = self.get_kernel_cache(node)
2575        _, src, _, ls = k._get_basic_kernel(k.init_local_size,
2576                                            node.inputs[0].ndim)
2577        if self.axis is None:
2578            redux = [True] * node.inputs[0].ndim
2579        else:
2580            redux = self.redux
2581        acc_dtype = getattr(self, 'acc_dtype', None)
2582        if acc_dtype is None:
2583            acc_dtype = node.outputs[0].type.dtype
2584        input = inp[0]
2585        output = out[0]
2586        nd_out = node.outputs[0].ndim
2587        code = """
2588        size_t gs = 1;
2589        size_t ls;
2590        unsigned int n = 1;
2591        unsigned int proxy_dim[%(nd_in)s];
2592        unsigned int proxy_off;
2593        int proxy_str[%(nd_in)s];
2594        void *args[%(n_args)s];
2595        PyGpuArrayObject *tmp;
2596        int err;
2597""" % dict(n_args=4 + (node.inputs[0].ndim * 2), nd_in=node.inputs[0].ndim)
2598
2599        if nd_out != 0:
2600            code += """
2601        size_t out_dims[%(nd_out)s];
2602        int need_out = %(output)s == NULL || %(output)s->ga.nd != %(nd_out)s;
2603""" % dict(nd_out=nd_out, output=output)
2604            j = 0
2605            for i in range(node.inputs[0].ndim):
2606                if not self.redux[i]:
2607                    code += """
2608         out_dims[%(j)s] = %(input)s->ga.dimensions[%(i)s];
2609         if (!need_out)
2610             need_out |= %(output)s->ga.dimensions[%(j)s] != out_dims[%(j)s];
2611""" % dict(j=j, i=i, input=input, output=output)
2612                    j += 1
2613            code += """
2614         if (need_out) {
2615             %(output)s = pygpu_empty(%(nd_out)s, out_dims, %(out_type)s, GA_C_ORDER, %(ctx)s, Py_None);
2616             if (!%(output)s) {
2617                 %(fail)s
2618             }
2619         }
2620        """ % dict(output=output, nd_out=nd_out, fail=sub['fail'],
2621                   ctx=sub['params'],
2622                   out_type=dtype_to_typecode(node.outputs[0].type.dtype))
2623        else:
2624            code += """
2625        if (%(output)s == NULL || %(output)s->ga.nd != 0) {
2626            Py_XDECREF(%(output)s);
2627            %(output)s = pygpu_empty(0, NULL, %(out_type)s, GA_C_ORDER,
2628                                     %(ctx)s, Py_None);
2629            if (!%(output)s) {
2630                %(fail)s
2631            }
2632        }
2633        """ % dict(output=output, fail=sub['fail'], ctx=sub['params'],
2634                   out_type=dtype_to_typecode(node.outputs[0].type.dtype))
2635
2636        if acc_dtype != node.outputs[0].type.dtype:
2637            code += """
2638        tmp = pygpu_empty(%(output)s->ga.nd, %(output)s->ga.dimensions,
2639                          %(acc_type)s, GA_C_ORDER, %(ctx)s, Py_None);
2640        if (!tmp) %(fail)s
2641        """ % dict(output=output, fail=sub['fail'], ctx=sub['params'],
2642                   acc_type=dtype_to_typecode(acc_dtype))
2643        else:
2644            code += """
2645        tmp = %(output)s;
2646        Py_INCREF(tmp);
2647        """ % dict(output=output)
2648
2649        # We need the proxies since we are passing a pointer to the
2650        # data into the call and therefore we need a real copy of the
2651        # data in the proper type.
2652        code += """
2653        args[0] = &n;
2654        args[1] = tmp->ga.data;
2655        args[2] = &tmp->ga.offset;
2656        """ % dict(output=output)
2657
2658        p = 3
2659        for i in range(node.inputs[0].ndim):
2660            code += """
2661        proxy_dim[%(i)s] = %(input)s->ga.dimensions[%(i)s];
2662        args[%(p)s] = &proxy_dim[%(i)s];
2663        n *= %(input)s->ga.dimensions[%(i)s];
2664        """ % dict(i=i, p=p, input=input)
2665            p += 1
2666            if not redux[i]:
2667                code += "gs *= %(input)s->ga.dimensions[%(i)s];" % dict(input=input, i=i)
2668
2669        code += """
2670        args[%(p)s] = %(input)s->ga.data;
2671        proxy_off = %(input)s->ga.offset;
2672        args[%(p)s+1] = &proxy_off;
2673        """ % dict(p=p, input=input)
2674        p += 2
2675
2676        for i in range(node.inputs[0].ndim):
2677            code += """
2678        proxy_str[%(i)s] = %(input)s->ga.strides[%(i)s];
2679        args[%(p)s] = &proxy_str[%(i)s];
2680        """ % dict(p=p, i=i, input=input)
2681            p += 1
2682
2683        code += """
2684        if (gs == 0) gs = 1;
2685        n /= gs;
2686        ls = %(ls)s;
2687        err = GpuKernel_call(&%(k_var)s, 1, &gs, &ls, 0, args);
2688        if (err != GA_NO_ERROR) {
2689            PyErr_Format(PyExc_RuntimeError,
2690                         "gpuarray error: GpuCAReduceCPY: %%s.",
2691                         GpuKernel_error(&%(k_var)s, err));
2692            %(fail)s
2693        }
2694
2695        if (%(cast_out)d) {
2696            err = GpuArray_move(&%(output)s->ga, &tmp->ga);
2697            Py_XDECREF(tmp);
2698            if (err != GA_NO_ERROR) {
2699                PyErr_Format(PyExc_RuntimeError,
2700                             "gpuarray error: GpuCAReduceCPY [cast]: %%s.",
2701                             GpuArray_error(&tmp->ga, err));
2702                %(fail)s
2703            }
2704        } else {
2705            Py_XDECREF(%(output)s);
2706            %(output)s = tmp;
2707        }
2708
2709        """ % dict(k_var='k_reduk_' + name,
2710                   ls=ls, fail=sub['fail'], output=output, input=input,
2711                   cast_out=bool(acc_dtype != node.outputs[0].type.dtype))
2712
2713        return code
2714
2715    def c_code_cache_version_apply(self, node):
2716        return (4, self.kernel_version(node))
2717
2718    def generate_kernel(self, node, odtype, redux):
2719        if isinstance(self.scalar_op, scalar.basic.Add):
2720            reduce_expr = "a + b"
2721        elif isinstance(self.scalar_op, scalar.basic.Mul):
2722            reduce_expr = "a * b"
2723        else:
2724            raise NotImplementedError()
2725        return ReductionKernel(node.inputs[0].type.context, odtype,
2726                               self.scalar_op.identity, reduce_expr, redux,
2727                               arguments=[make_argument(node.inputs[0], 'a')],
2728                               init_nd=node.inputs[0].ndim)
2729
2730    def perform(self, node, inp, out, ctx):
2731        input, = inp
2732        output, = out
2733
2734        if self.axis is None:
2735            redux = [True] * input.ndim
2736        else:
2737            redux = self.redux
2738
2739        if any(redux):
2740            output[0] = self.get_kernel_cache(node)(input).astype(
2741                copy=False, dtype=node.outputs[0].type.dtype)
2742        else:
2743            output[0] = pygpu.gpuarray.array(input, copy=True,
2744                                             dtype=node.outputs[0].type.dtype,
2745                                             context=ctx)
2746# To allow reloading old pickled files
2747GpuCAReduce = GpuCAReduceCPY
2748