1from __future__ import absolute_import, print_function, division
2import os
3import copy
4import re
5import numpy as np
6
7import theano
8from theano import Op, Apply, Type, Variable
9from theano import tensor, config
10from theano.gradient import grad_undefined
11from theano.scalar import (bool as bool_t,
12                           int32 as int32_t)
13from theano.tensor.basic import (
14    Alloc, AllocEmpty, alloc_validate_shape, Join, Split)
15
16from theano.gof import HideC, COp, ParamsType
17from theano.gof.utils import MethodNotDefined
18from theano.gof.opt import copy_stack_trace
19
20from collections import deque
21
22from six import string_types
23from six.moves import xrange
24from six import iteritems
25
26try:
27    import pygpu
28    from pygpu import gpuarray
29except ImportError:
30    pass
31
32from .type import (GpuArrayType, GpuArrayConstant, gpu_context_type,
33                   get_context, ContextNotDefined, EQ_MAP)
34from .fp16_help import write_w
35
36
37def as_gpuarray_variable(x, context_name):
38    """
39    This will attempt to convert `x` into a variable on the GPU.
40
41    It can take either a value of another variable.  If `x` is already
42    suitable, it will be returned as-is.
43
44    Parameters
45    ----------
46    x
47        Object to convert
48    context_name : str or None
49        target context name for the result
50
51    """
52    # If this is already some form of variable, try to avoid an extra transfer
53    if isinstance(x, Variable):
54        while True:
55            # If we are already a GpuArrayVariable in the right context
56            # then there is nothing to do.
57            if (isinstance(x.type, GpuArrayType) and
58                    x.type.context_name == context_name):
59                return x
60
61            # If x is the result of a transfer, try to dig through.
62            if getattr(x, 'owner', None):
63                if isinstance(x.owner.op, HostFromGpu):
64                    x = x.owner.inputs[0]
65                    continue
66                if isinstance(x.owner.op, GpuFromHost):
67                    x = x.owner.inputs[0]
68                    continue
69                if isinstance(x.owner.op, GpuToGpu):
70                    x = x.owner.inputs[0]
71                    continue
72
73            # If none of the conditions where met, then continue with
74            # the rest of the body
75            break
76
77        # If we couldn't deal with transfers, then maybe it's a tensor
78        if isinstance(x.type, tensor.TensorType):
79            return copy_stack_trace(x, GpuFromHost(context_name)(x))
80
81    # Try _as_GpuArrayVariable if possible
82    if hasattr(x, '_as_GpuArrayVariable'):
83        return copy_stack_trace(x, x._as_GpuArrayVariable(context_name))
84
85    # If it didn't work try for a constant
86    ctx = get_context(context_name)
87
88    if isinstance(x, gpuarray.GpuArray):
89        if x.context.ptr != ctx.ptr:
90            x = x.transfer(ctx)
91
92    x = gpuarray.asarray(x, context=ctx)
93
94    bcast = [(s == 1) for s in x.shape]
95    return GpuArrayConstant(GpuArrayType(dtype=x.dtype,
96                                         broadcastable=bcast,
97                                         context_name=context_name),
98                            x)
99
100
101def infer_context_name(*vars):
102    """
103    Infer the context name to use from the inputs given
104
105    """
106    # We try to infer the closest context first
107    # TODO: What to do in case of context conflicts?
108    #       We currently use a first found wins approach.
109    todo = deque()
110    todo.extendleft(vars)
111    while todo:
112        v = todo.pop()
113        if isinstance(v.type, GpuArrayType):
114            return v.type.context_name
115        if hasattr(v.tag, 'context_name'):
116            return v.tag.context_name
117        if v.owner:
118            if isinstance(v.owner.op, HostFromGpu):
119                return v.owner.inputs[0].type.context_name
120            if len(v.owner.inputs) == 1:
121                todo.extendleft(v.owner.inputs)
122    # If we can't find a context try None if it exists
123    try:
124        get_context(None)
125        return None
126    except ContextNotDefined:
127        raise ValueError("Could not infer context from inputs")
128
129
130def gpuarray_helper_inc_dir():
131    return os.path.join(os.path.dirname(__file__), 'c_code')
132
133
134class Kernel(object):
135    """
136    This class groups together all the attributes of a gpu kernel.
137
138    `params` should contain the data type for each argument.  Buffer
139    arguments should use the GpuArray class as the data type and
140    scalar should use their equivalent numpy dtype.  For ga_size and
141    ga_ssize, use gpuarray.SIZE and gpuarray.SSIZE.
142
143    If the `ctypes` flags is set to `True` then it should be a C
144    string which represent the typecode to use.
145
146    `flags` can contain the following keys whose values are booleans:
147
148        have_double
149            the kernel uses double-typed variables somewhere
150        have_small
151            the kernel uses variables whose type takes less than 4
152            bytes somewhere
153        have_complex
154            the kernel uses complex values somewhere
155        have_half
156            the kernel uses half-floats somewhere
157        ctypes
158            the `params` list consists of C typecodes
159
160    It can also have the key `cflags` which is a string of C flag
161    values like this `"GA_USE_DOUBLE|GA_USE_SMALL"`.
162
163    Parameters
164    ----------
165    code: str
166        The source code of the kernel.
167    params: list
168        list of parameter types.
169    name: str
170        the name of the kernel function in the source.
171    flags: dict
172        dictionary of flags
173    codevar: str
174        the name of the variable for the code object.
175        (defaults to `kcode_` + name)
176    objvar: str
177        the name of the variable for the kernel object.
178        (defaults to `k_` + name)
179    fname: str
180        the name of the function wrapper.
181        (defaults to name + `_call`)
182    sname: str
183        the name of the scheduled call function
184        (defaults to name _ `_scall`)
185
186    """
187
188    def __init__(self, code, params, name, flags,
189                 codevar=None, objvar=None, fname=None, sname=None):
190        self.code = code
191        self.params = params
192        self.name = name
193        self.flags = flags
194        if codevar is None:
195            codevar = 'kcode_' + name
196        self.codevar = codevar
197        if objvar is None:
198            objvar = 'k_' + name
199        self.objvar = objvar
200        if fname is None:
201            fname = name + '_call'
202        self.fname = fname
203        if sname is None:
204            sname = name + '_scall'
205        self.sname = sname
206
207    @staticmethod
208    def get_flags(*types):
209        def get_dtype(t):
210            if isinstance(t, string_types):
211                return np.dtype(t)
212            elif isinstance(t, Type):
213                return t.dtype
214            elif isinstance(t, Variable):
215                return t.type.dtype
216            else:
217                raise TypeError("can't get a dtype from %s" % (type(t),))
218        dtypes = [get_dtype(t) for t in types]
219        flags = dict()
220        if any(d == np.float64 for d in dtypes):
221            flags['have_double'] = True
222        if any(d.itemsize < 4 for d in dtypes):
223            flags['have_small'] = True
224        if any(d.kind == 'c' for d in dtypes):
225            flags['have_complex'] = True
226        if any(d == np.float16 for d in dtypes):
227            flags['have_half'] = True
228        return flags
229
230    def _get_c_flags(self):
231        res = []
232        if self.flags.get('cflags', '') != '':
233            res.append(self.flags['cflags'])
234        if self.flags.get('have_double', False):
235            res.append('GA_USE_DOUBLE')
236        if self.flags.get('have_small', False):
237            res.append('GA_USE_SMALL')
238        if self.flags.get('have_complex', False):
239            res.append('GA_USE_COMPLEX')
240        if self.flags.get('have_half', False):
241            res.append('GA_USE_HALF')
242        res = '|'.join(res)
243        if not res:
244            return '0'
245        return res
246
247    def _get_py_flags(self):
248        res = dict(self.flags)
249        cflags = res.pop('cflags', '')
250        for fl in cflags.split('|'):
251            fl = fl.strip()
252            if fl == 'GA_USE_DOUBLE':
253                res['have_double'] = True
254            if fl == 'GA_USE_SMALL':
255                res['have_small'] = True
256            if fl == 'GA_USE_COMPLEX':
257                res['have_complex'] = True
258            if fl == 'GA_USE_HALF':
259                res['have_half'] = True
260        return res
261
262    def _get_c_types(self):
263        def m(t):
264            if t == gpuarray.GpuArray:
265                return "GA_BUFFER"
266            else:
267                return str(gpuarray.dtype_to_typecode(t))
268        return ', '.join(m(t) for t in self.params)
269
270
271def get_ctype(dtype):
272    if dtype is gpuarray.GpuArray:
273        return "gpudata *"
274    elif isinstance(dtype, np.dtype):
275        return 'npy_' + dtype.name
276    elif dtype == gpuarray.SIZE:
277        return "size_t"
278    elif dtype == gpuarray.SSIZE:
279        return "ssize_t"
280    else:
281        dtype = np.dtype(dtype)
282        return 'npy_' + dtype.name
283
284
285class GpuKernelBase(object):
286    """
287    Base class for operations that need to compile kernels.
288
289    It is not mandatory to use this class, but it helps with a lot of
290    the small things that you have to pay attention to.
291
292    """
293    params_type = gpu_context_type
294
295    def get_params(self, node):
296        # Default implementation, suitable for most sub-classes.
297        # To be necessarly overridden in a subclass that uses a ParamsType.
298        assert (self.params_type is gpu_context_type and
299                node.inputs and
300                isinstance(node.inputs[0].type, GpuArrayType))
301        return node.inputs[0].type.context
302
303    def get_gpu_context(self, node):
304        # Private method used to retrieve GPU context, instead of
305        # directly using self.get_params(node), as this latter may be overridden.
306        if isinstance(self.params_type, ParamsType) and self.params_type.has_type(gpu_context_type):
307            # Get field name of gpu_context_type into ParamsType object.
308            gpu_context_field = self.params_type.get_field(gpu_context_type)
309            # Get Params object (self.get_params() should have been overridden).
310            wrap = self.get_params(node)
311            # Get GPU context from Params object.
312            return getattr(wrap, gpu_context_field)
313        assert self.params_type is gpu_context_type
314        return self.get_params(node)
315
316    def get_gpu_context_c_name(self, params_c_name):
317        # Private method used to retrieve C name of GPU context variable,
318        # instead of directly using sub['params'], as params may not be a GPU context
319        # (e.g. for sub-classes that use ParamsType).
320        if isinstance(self.params_type, ParamsType) and self.params_type.has_type(gpu_context_type):
321            return "(%s->%s)" % (params_c_name, self.params_type.get_field(gpu_context_type))
322        assert self.params_type is gpu_context_type
323        return params_c_name
324
325    def gpu_kernels(self, node, name):
326        """
327        This is the method to override. This should return an iterable
328        of Kernel objects that describe the kernels this op will need.
329
330        """
331        raise MethodNotDefined('gpu_kernels')
332
333    def c_headers(self):
334        try:
335            o = super(GpuKernelBase, self).c_headers()
336        except MethodNotDefined:
337            o = []
338        return o + ['gpuarray/types.h', 'numpy/npy_common.h']
339
340    def c_header_dirs(self):
341        try:
342            o = super(GpuKernelBase, self).c_header_dirs()
343        except MethodNotDefined:
344            o = []
345        # We rely on the input types for the directory to gpuarray includes
346        return o + [np.get_include()]
347
348    def _generate_kernel_code(self, k):
349        code = '\\n'.join(l for l in k.code.split('\n'))
350        code = code.replace('"', '\\"')
351        return ("""static const char *%(cname)s_unsigned = "%(code)s";
352                static const char *%(cname)s = (char *)%(cname)s_unsigned;
353                """ %
354                dict(cname=k.codevar, code=code))
355
356    def _generate_kernel_vars(self, k):
357        return """GpuKernel %(kname)s;""" % dict(kname=k.objvar)
358
359    def _generate_kernel_wrap(self, k):
360        args = []
361        setargs = []
362        for i, p in enumerate(k.params):
363            args.append("{} arg{}".format(get_ctype(p), i))
364            if p is gpuarray.GpuArray:
365                setarg = "GpuKernel_setarg(&{0}, {1}, arg{1});"
366            else:
367                setarg = "GpuKernel_setarg(&{0}, {1}, &arg{1});"
368            setargs.append(setarg.format(k.objvar, i))
369
370        args = ', '.join(args)
371        setargs = '\n  '.join(setargs)
372
373        return """
374int {fname}(unsigned int _nd, size_t *_gdim, size_t *_ldim, size_t _shared,
375                  {args}) {{
376  {setargs}
377
378  return GpuKernel_call(&{kname}, _nd, _gdim, _ldim, _shared, NULL);
379}}
380
381int {sname}(unsigned int _nd, size_t *_n, size_t _shared, {args}) {{
382  size_t _gs = 0;
383  size_t _ls = 0;
384  int _err;
385
386  if (_nd != 1) return GA_UNSUPPORTED_ERROR;
387
388  _err = GpuKernel_sched(&{kname}, _n[0], &_gs, &_ls);
389  if (_err != GA_NO_ERROR)
390    return _err;
391
392  {setargs}
393
394  return GpuKernel_call(&{kname}, 1, &_gs, &_ls, _shared, NULL);
395}}
396        """.format(args=args, fname=k.fname, setargs=setargs, sname=k.sname,
397                   kname=k.objvar)
398
399    def c_support_code_apply(self, node, name):
400        kernels = self.gpu_kernels(node, name)
401        codes = '\n'.join(self._generate_kernel_code(k) for k in kernels)
402        return codes
403
404    def c_support_code_struct(self, node, name):
405        kernels = self.gpu_kernels(node, name)
406        kvars = '\n'.join(self._generate_kernel_vars(k) for k in kernels)
407        wrappers = '\n'.join(self._generate_kernel_wrap(k) for k in kernels)
408        return kvars + '\n' + wrappers
409
410    def _generate_zeros(self, k):
411        return """memset(&%(v)s, 0, sizeof(%(v)s));""" % dict(v=k.objvar)
412
413    def _generate_kernel_init(self, k, fail, ctx):
414        return """{
415  int err;
416  int types[%(numargs)u] = {%(types)s};
417  if ((err = GpuKernel_init(&%(ovar)s, %(ctx)s->ctx, 1,
418                            &%(cname)s, NULL, "%(kname)s", %(numargs)u,
419                            types, %(flags)s, NULL)) != GA_NO_ERROR) {
420    PyErr_Format(PyExc_RuntimeError, "GpuKernel_init error %%d: %%s",
421                 err, gpucontext_error(%(ctx)s->ctx, err));
422    %(fail)s
423  }
424}""" % dict(numargs=len(k.params), types=k._get_c_types(),
425            ovar=k.objvar, kname=k.name, cname=k.codevar,
426            flags=k._get_c_flags(), fail=fail, ctx=ctx)
427
428    def c_init_code_struct(self, node, name, sub):
429        ctx = self.get_gpu_context_c_name(sub['params'])
430        kernels = self.gpu_kernels(node, name)
431        inits_0 = '\n'.join(self._generate_zeros(k) for k in kernels)
432        inits = '\n'.join(self._generate_kernel_init(k, sub['fail'], ctx)
433                          for k in kernels)
434        return '\n'.join([inits_0, inits])
435
436    def _generate_kernel_cleanup(self, k):
437        return "GpuKernel_clear(&%(ovar)s);" % dict(ovar=k.objvar)
438
439    def c_cleanup_code_struct(self, node, name):
440        kernels = self.gpu_kernels(node, name)
441        cleanups = '\n'.join(self._generate_kernel_cleanup(k) for k in kernels)
442        return cleanups
443
444    # This is a shorthand for if your op only has a fixed version
445    # You can reimplement it, but make sure to call kernel_version()
446    def c_code_cache_version_apply(self, node):
447        v = self.c_code_cache_version()
448        if not v:
449            return ()
450        return (v, self.kernel_version(node))
451
452    def kernel_version(self, node):
453        """
454        If you override :meth:`c_code_cache_version_apply`, call this
455        method to have the version of the kernel support code.
456
457        Parameters
458        ----------
459        node : apply node
460            The node that we need the cache version for.
461
462        """
463        return (9,)
464
465
466def forward_string_meth(name):
467    def f(*args):
468        res = getattr(GpuKernelBase, name)(*args)
469        try:
470            res = res + '\n' + getattr(COp, name)(*args)
471        except MethodNotDefined:
472            pass
473        return res
474    f.__name__ = name
475    return f
476
477
478def get_dtype(s):
479    if s == '*':
480        return gpuarray.GpuArray
481    if s == 'size':
482        return gpuarray.SIZE
483    if s == 'ssize':
484        return gpuarray.SSIZE
485    else:
486        return np.dtype(s)
487
488
489class CGpuKernelBase(COp, GpuKernelBase):
490    """
491    Class to combine GpuKernelBase and COp.
492
493    It adds a new section type 'kernels' where you can define kernels
494    with the '#kernel' tag
495    """
496    SECTIONS = copy.copy(COp.SECTIONS)
497    SECTIONS.add('kernels')
498
499    kernel_re = re.compile(r'^#kernel ([a-zA-Z_].*?)$', re.MULTILINE)
500
501    get_params = GpuKernelBase.get_params
502    c_support_code_apply = forward_string_meth('c_support_code_apply')
503    c_support_code_struct = forward_string_meth('c_support_code_struct')
504    c_init_code_struct = forward_string_meth('c_init_code_struct')
505    c_cleanup_code_struct = forward_string_meth('c_cleanup_code_struct')
506
507    def c_code_cache_version_apply(self, node):
508        return GpuKernelBase.c_code_cache_version_apply(self, node)
509
510    def _type_macros(self, node):
511        define_template = "#define %s %s\n"
512        undef_template = "#undef %s\n"
513        define_macros = []
514        undef_macros = []
515        for i, v in enumerate(node.inputs):
516            if isinstance(v.type, GpuArrayType):
517                macro_name = "DTYPE_INPUT_%d" % (i,)
518                macro_value = pygpu.gpuarray.dtype_to_ctype(v.dtype)
519                define_macros.append(
520                    define_template %
521                    (macro_name, macro_value))
522                undef_macros.append(undef_template % macro_name)
523        for i, v in enumerate(node.outputs):
524            if isinstance(v.type, GpuArrayType):
525                macro_name = "DTYPE_OUTPUT_%d" % (i,)
526                macro_value = pygpu.gpuarray.dtype_to_ctype(v.dtype)
527                define_macros.append(
528                    define_template %
529                    (macro_name, macro_value))
530                undef_macros.append(undef_template % macro_name)
531
532        return ''.join(define_macros), ''.join(undef_macros)
533
534    def gpu_kernels(self, node, name):
535        if hasattr(self, '_cached_kernels'):
536            return self._cached_kernels
537        if 'kernels' in self.code_sections:
538            code = self.code_sections['kernels']
539            split = self.kernel_re.split(code)
540            if split[0].strip() != '':
541                raise ValueError("Stray code in kernels section before the "
542                                 "first #kernel statement.")
543            def_macros, undef_macros = self._type_macros(node)
544            n = 1
545            res = []
546            while n < len(split):
547                kspec = split[n]
548                kcode = split[n + 1]
549                splt2 = kspec.split(':')
550                if len(splt2) != 3:
551                    raise ValueError("Bad kernel spec: %s" % (kspec,))
552                kname = splt2[0].strip()
553                ktypes = [get_dtype(s.strip()) for s in splt2[1].split(',')]
554                kflags = splt2[2].strip()
555                kcode = def_macros + '\n' + kcode + '\n' + undef_macros
556                res.append(Kernel(kcode, ktypes, kname,
557                                  flags=dict(cflags=kflags)))
558                n += 2
559            self._cached_kernels = res
560            return res
561        else:
562            return GpuKernelBase.gpu_kernels(self, node, name)
563
564
565class HostFromGpu(Op):
566    """
567    Transfer data to CPU.
568
569    """
570    __props__ = ()
571    _f16_ok = True
572
573    def __str__(self):
574        return 'HostFromGpu(gpuarray)'
575
576    def make_node(self, x):
577        if not isinstance(x.type, GpuArrayType):
578            raise TypeError(x)
579        out_var = tensor.TensorType(dtype=x.dtype,
580                                    broadcastable=x.broadcastable)()
581        # Keep the special comparison if there is one.
582        values_eq_approx = getattr(x.tag, 'values_eq_approx', None)
583        if values_eq_approx:
584            out_var.tag.values_eq_approx = EQ_MAP.get(values_eq_approx,
585                                                      values_eq_approx)
586        return Apply(self, [x], [out_var])
587
588    def perform(self, node, inp, out):
589        x, = inp
590        z, = out
591        z[0] = np.asarray(x)
592
593    def c_code(self, node, name, inputs, outputs, sub):
594        return """
595        GpuArray %(name)s_ga_s;
596        GpuArray *%(name)s_ga = NULL;
597        int %(name)serr;
598        PyArray_Descr *%(name)s_dtype;
599        if (!GpuArray_ISONESEGMENT(&%(inp)s->ga)) {
600            if (GpuArray_copy(&%(name)s_ga_s, &%(inp)s->ga, GA_C_ORDER) != GA_NO_ERROR) {
601                PyErr_SetString(PyExc_RuntimeError, "Can't make contiguous copy");
602                %(fail)s;
603            }
604            %(name)s_ga = &%(name)s_ga_s;
605        } else {
606            %(name)s_ga = &%(inp)s->ga;
607        }
608        %(name)s_dtype = typecode_to_dtype(%(name)s_ga->typecode);
609        Py_XDECREF(%(out)s);
610        // PyArray_Empty below steals a reference to the dtype we pass it
611        // so we need an extra one to spare.
612        Py_INCREF(%(name)s_dtype);
613        %(out)s = (PyArrayObject *)PyArray_Empty(%(inp)s->ga.nd,
614                                (npy_intp *)%(inp)s->ga.dimensions,
615                                %(name)s_dtype,
616                                (%(inp)s->ga.flags & GA_F_CONTIGUOUS) &&
617                                !(%(inp)s->ga.flags & GA_C_CONTIGUOUS));
618        if (%(out)s == NULL) {
619            if (%(name)s_ga == &%(name)s_ga_s) GpuArray_clear(%(name)s_ga);
620            %(fail)s
621        }
622        Py_BEGIN_ALLOW_THREADS
623        %(name)serr = GpuArray_read(PyArray_DATA(%(out)s),
624                                    PyArray_NBYTES(%(out)s),
625                                    %(name)s_ga);
626        Py_END_ALLOW_THREADS
627        if (%(name)s_ga == &%(name)s_ga_s) GpuArray_clear(%(name)s_ga);
628        if (%(name)serr != GA_NO_ERROR) {
629            PyErr_SetString(PyExc_RuntimeError, "Could not read device data.");
630            %(fail)s
631        }
632        """ % {'name': name, 'fail': sub['fail'], 'inp': inputs[0],
633               'out': outputs[0]}
634
635    def c_code_cache_version(self):
636        return (2,)
637
638    def grad(self, inputs, grads):
639        gz, = grads
640        return [GpuFromHost(inputs[0].type.context_name)(gz)]
641
642    def R_op(self, inputs, eval_points):
643        ev, = eval_points
644        return [self(ev)]
645
646    def infer_shape(self, node, xshp):
647        return xshp
648
649host_from_gpu = HostFromGpu()
650
651
652class GpuFromHost(Op):
653    """
654    Transfer data to GPU.
655
656    """
657    __props__ = ('context_name',)
658    _f16_ok = True
659    params_type = gpu_context_type
660
661    def __init__(self, context_name):
662        self.context_name = context_name
663
664    def __str__(self):
665        return 'GpuFromHost<%s>' % (self.context_name,)
666
667    def make_node(self, x):
668        if not isinstance(x.type, tensor.TensorType):
669            raise TypeError(x)
670        if "complex" in x.dtype:
671            raise TypeError("complex not supported in the new gpuarray back-end.", x)
672        out_var = GpuArrayType(broadcastable=x.broadcastable,
673                               context_name=self.context_name,
674                               dtype=x.dtype)()
675        # Keep the special comparison if there is one.
676        values_eq_approx = getattr(x.tag, 'values_eq_approx', None)
677        if values_eq_approx:
678            out_var.tag.values_eq_approx = EQ_MAP.get(values_eq_approx,
679                                                      values_eq_approx)
680        return Apply(self, [x], [out_var])
681
682    def get_params(self, node):
683        return get_context(self.context_name)
684
685    def perform(self, node, inp, out, ctx):
686        x, = inp
687        z, = out
688        z[0] = gpuarray.array(x, context=ctx)
689
690    def grad(self, inputs, grads):
691        gz, = grads
692        return [as_gpuarray_variable(
693                gz, context_name=self.context_name).transfer('cpu')]
694
695    def R_op(self, inputs, eval_points):
696        ev, = eval_points
697        return [self(ev)]
698
699    def infer_shape(self, node, xshp):
700        return xshp
701
702    def c_headers(self):
703        return ["gpuarray_helper.h"]
704
705    def c_header_dirs(self):
706        return [gpuarray_helper_inc_dir()]
707
708    def c_code(self, node, name, inputs, outputs, sub):
709        return """
710        PyArrayObject *%(name)s_tmp;
711        %(name)s_tmp = PyArray_GETCONTIGUOUS(%(inp)s);
712        int err;
713        if (%(name)s_tmp == NULL)
714          %(fail)s
715
716        if (%(out)s == NULL || !GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga) ||
717            !theano_size_check(%(out)s, PyArray_NDIM(%(name)s_tmp),
718                               (size_t *)PyArray_DIMS(%(name)s_tmp),
719                               get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)))) {
720          Py_XDECREF(%(out)s);
721          %(out)s = pygpu_empty(PyArray_NDIM(%(name)s_tmp),
722                                (size_t *)PyArray_DIMS(%(name)s_tmp),
723                                get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)),
724                                GA_C_ORDER, %(ctx)s, Py_None);
725          if (%(out)s == NULL) {
726            Py_DECREF(%(name)s_tmp);
727            %(fail)s;
728          }
729        }
730
731        Py_BEGIN_ALLOW_THREADS
732        err = GpuArray_write(&%(out)s->ga, PyArray_DATA(%(name)s_tmp),
733                             PyArray_NBYTES(%(name)s_tmp));
734        Py_END_ALLOW_THREADS
735        Py_DECREF(%(name)s_tmp);
736        if (err != GA_NO_ERROR) {
737          PyErr_Format(PyExc_RuntimeError, "Could not write data to gpu");
738          %(fail)s;
739        }
740        """ % {'name': name, 'inp': inputs[0], 'ctx': sub['params'],
741               'out': outputs[0], 'fail': sub['fail']}
742
743    def c_code_cache_version(self):
744        return (10,)
745
746
747class GpuToGpu(Op):
748    """
749    Transfer data between GPUs.
750
751    """
752    __props__ = ('context_name',)
753    _f16_ok = True
754    params_type = gpu_context_type
755
756    def __init__(self, context_name):
757        self.context_name = context_name
758
759    def __str__(self):
760        return 'GpuToGpu<%s>' % (self.context_name,)
761
762    def make_node(self, x):
763        if not isinstance(x.type, GpuArrayType):
764            raise TypeError(x)
765        return Apply(self, [x], [GpuArrayType(broadcastable=x.broadcastable,
766                                              context_name=self.context_name,
767                                              dtype=x.dtype)()])
768
769    def get_params(self, node):
770        return get_context(self.context_name)
771
772    def perform(self, node, inp, out, ctx):
773        x, = inp
774        z, = out
775        z[0] = x.transfer(ctx)
776
777    def grad(self, inputs, grads):
778        gz, = grads
779        return [GpuToGpu(inputs[0].type.context_name)(gz)]
780
781    def R_op(self, inputs, eval_points):
782        return self(eval_points[0])
783
784    def infer_shape(self, node, xshp):
785        return xshp
786
787    def c_code(self, node, name, inputs, outputs, sub):
788        return """
789        Py_XDECREF(%(out)s);
790        %(out)s = pygpu_empty(%(inp)s->ga.nd,
791                              %(inp)s->ga.dimensions,
792                              %(inp)s->ga.typecode,
793                              GpuArray_IS_C_CONTIGUOUS(&(%(inp)s->ga)) ? GA_C_ORDER:GA_F_ORDER,
794                              %(ctx)s, Py_None);
795        if (%(out)s == NULL) {
796            %(fail)s
797        }
798        if (pygpu_transfer(%(out)s, %(inp)s)) {
799            %(fail)s
800        }
801        """ % {'inp': inputs[0], 'ctx': sub['params'],
802               'out': outputs[0], 'fail': sub['fail']}
803
804    def c_code_cache_version(self):
805        return (1,)
806
807
808class GpuAlloc(HideC, Alloc):
809    """
810    Allocate initialized memory on the GPU.
811
812    Parameters
813    ----------
814    context_name : str
815        The name of the context in which to allocate memory
816    memset_0 : bool
817        It's only an optimized version. True, it means the
818        value is always 0, so the c code call memset as it is faster.
819
820    """
821
822    __props__ = ('memset_0', 'context_name')
823    _f16_ok = True
824    params_type = ParamsType(context=gpu_context_type, memset_0=bool_t)
825
826    def __init__(self, context_name, memset_0=False):
827        self.context_name = context_name
828        self.memset_0 = memset_0
829
830    def get_params(self, node):
831        return self.params_type.get_params(context=get_context(self.context_name),
832                                           memset_0=self.memset_0)
833
834    def __str__(self):
835        # Hide the memset parameter when not used to prevent confusion.
836        if self.memset_0:
837            m = "{memset_0=True}"
838        else:
839            m = ""
840        return "%s<%s>%s" % (self.__class__.__name__, self.context_name, m)
841
842    def make_node(self, value, *shape):
843        value = as_gpuarray_variable(value, context_name=self.context_name)
844        sh, bcast = alloc_validate_shape(shape)
845        if value.ndim > len(sh):
846            TypeError("The GpuAlloc value to use has more dimensions "
847                      "than the specified shape", value.ndim, len(sh))
848        otype = value.type.clone(broadcastable=bcast)
849        return Apply(self, [value] + sh, [otype()])
850
851    def c_headers(self):
852        return ['<numpy_compat.h>']
853
854    def perform(self, node, inputs, outs, params):
855        out, = outs
856        v = inputs[0]
857        sh = tuple(map(int, inputs[1:]))
858        if out[0] is None or out[0].shape != sh:
859            if self.memset_0:
860                out[0] = gpuarray.zeros(sh, dtype=v.dtype, context=params.context)
861            else:
862                out[0] = gpuarray.empty(sh, dtype=v.dtype, context=params.context)
863                out[0][...] = v
864        else:
865            out[0][...] = v
866
867    def c_code(self, node, name, inp, out, sub):
868        vv = inp[0]
869        ndim = len(inp[1:])
870        zz, = out
871
872        code = """
873        int i;
874        size_t %(name)s_shape[%(ndim)s];
875        """ % dict(name=name, ndim=ndim)
876
877        for i, shp_i in enumerate(inp[1:]):
878            code += """
879        %(name)s_shape[%(i)s] = ((dtype_%(shp_i)s *)PyArray_DATA(%(shp_i)s))[0];
880        """ % dict(name=name, i=i, shp_i=shp_i)
881
882        code += """
883        int need_new_out = (NULL == %(zz)s || %(zz)s->ga.nd != %(ndim)s);
884
885        if (!need_new_out)
886            for (i = 0; i < %(ndim)s; i++)
887                need_new_out |= %(zz)s->ga.dimensions[i] != %(name)s_shape[i];
888
889        if (need_new_out && (%(params)s->memset_0)) {
890            //pygpu_zeros can be faster then empty followed by memset.
891            Py_XDECREF(%(zz)s);
892            %(zz)s = pygpu_zeros(%(ndim)s, %(name)s_shape,
893                                 %(vv)s->ga.typecode, GA_C_ORDER,
894                                 %(params)s->context, Py_None);
895            if (!%(zz)s) {
896                %(fail)s
897            }
898        } else {
899            if (need_new_out) {
900                Py_XDECREF(%(zz)s);
901                %(zz)s = pygpu_empty(%(ndim)s, %(name)s_shape,
902                                     %(vv)s->ga.typecode, GA_C_ORDER,
903                                     %(params)s->context, Py_None);
904                if (!%(zz)s) {
905                    %(fail)s
906                }
907            }
908            if (%(params)s->memset_0 && GpuArray_ISONESEGMENT(&%(zz)s->ga))
909            {
910                int err = GpuArray_memset(&%(zz)s->ga, 0);
911                if (err != GA_NO_ERROR)
912                {
913                    PyErr_Format(PyExc_MemoryError,
914                                 "GpuAlloc: Error memsetting %%llu"
915                                 " element of device memory to 0.",
916                                 (unsigned long long)PyGpuArray_SIZE(%(zz)s));
917                    %(fail)s;
918                }
919            }
920            else if (GpuArray_setarray(&%(zz)s->ga, &%(vv)s->ga) !=
921                     GA_NO_ERROR) {
922                PyErr_SetString(PyExc_ValueError, "setarray failed");
923                %(fail)s
924            }
925        }
926        """ % dict(name=name, ndim=ndim, zz=zz, vv=vv, params=sub['params'],
927                   fail=sub['fail'])
928
929        return code
930
931    def c_code_cache_version(self):
932        return (4,)
933
934    def do_constant_folding(self, node):
935        from . import subtensor, blas
936        for client in node.outputs[0].clients:
937            if client[0] == 'output':
938                # If the output is a constant, it will have to be deepcopied
939                # each time the function is called.  So we do not fold.
940                return False
941            # The following ops work inplace of their input id 0.
942            elif (client[1] == 0 and
943                  # Ops that will work inplace on the Alloc. So if they
944                  # get constant_folded, they would copy the
945                  # constant and this is less efficients.
946
947                  # Not doing the constant folding could also lower
948                  # the peak memory usage, as we the "constant" won't
949                  # always exists.
950                  isinstance(client[0].op,
951                             (subtensor.GpuIncSubtensor,
952                              subtensor.GpuAdvancedIncSubtensor1,
953                              subtensor.GpuAdvancedIncSubtensor1_dev20,
954                              subtensor.GpuAdvancedIncSubtensor,
955                              blas.GpuGemm, blas.GpuGemv,
956                              blas.GpuGer)
957                             )):
958                return False
959            # If the clients is a transfer, we don't want to fold. We
960            # let the moving opt finish before deciding what to do.
961            elif isinstance(client[0].op, HostFromGpu):
962                return False
963        return True
964
965
966class GpuAllocEmpty(HideC, AllocEmpty):
967    """
968    Allocate uninitialized memory on the GPU.
969
970    """
971    __props__ = ('dtype', 'context_name')
972    _f16_ok = True
973    params_type = ParamsType(context=gpu_context_type,
974                             typecode=int32_t)
975
976    def __init__(self, dtype, context_name):
977        self.dtype = dtype
978        self.context_name = context_name
979
980    @property
981    def typecode(self):
982        return gpuarray.dtype_to_typecode(self.dtype)
983
984    def get_params(self, node):
985        return self.params_type.get_params(context=get_context(self.context_name),
986                                           typecode=self.typecode)
987
988    def make_node(self, *shape):
989        sh, bcast = alloc_validate_shape(shape)
990        output = GpuArrayType(dtype=self.dtype, broadcastable=bcast,
991                              context_name=self.context_name)()
992        output.tag.values_eq_approx = tensor.type.values_eq_approx_always_true
993        # The outut can contain nan/inf.
994        output.type.filter_checks_isfinite = False
995        output.tag.nan_guard_mode_check = False
996        return Apply(self, sh, [output])
997
998    def debug_perform(self, node, inputs, out_, params):
999        self.perform(node, inputs, out_, params)
1000        out_[0][0][:] = -123456789
1001
1002    def perform(self, node, inputs, out_, params):
1003        out = out_[0]
1004        sh = [int(i) for i in inputs]
1005        if out[0] is None or out[0].shape != sh:
1006            out[0] = pygpu.empty(sh, dtype=self.dtype, context=params.context)
1007        # if out[0] is the right shape, we just return it
1008
1009    def c_headers(self):
1010        return ['<gpuarray_helper.h>']
1011
1012    def c_header_dirs(self):
1013        return [gpuarray_helper_inc_dir()]
1014
1015    def c_code(self, node, name, inp, out, sub):
1016        ndim = len(inp)
1017        zz = out[0]
1018        fail = sub['fail']
1019
1020        code = ["""
1021int i;
1022size_t shape[%(ndim)s];
1023""" % dict(ndim=ndim)]
1024
1025        for i, shp_i in enumerate(inp):
1026            code.append("""
1027shape[%(i)s] = ((dtype_%(shp_i)s *)PyArray_DATA(%(shp_i)s))[0];
1028""" % dict(i=i, shp_i=shp_i))
1029
1030        code.append("""
1031if (theano_prep_output(&%(zz)s, %(ndim)s, shape, %(params)s->typecode, GA_C_ORDER,
1032                       %(params)s->context)) {
1033  %(fail)s
1034}
1035""" % dict(zz=zz, ndim=ndim, fail=fail, params=sub['params']))
1036
1037        return ''.join(code)
1038
1039    def c_code_cache_version(self):
1040        return (2,)
1041
1042    def do_constant_folding(self, node):
1043        return False
1044
1045    def infer_shape(self, node, input_shapes):
1046        return [node.inputs]
1047
1048    def grad(self, *args):
1049        # Don't reuse the grad implementation from Alloc
1050        raise NotImplementedError("grad disabled")
1051
1052
1053def empty_like(var):
1054    return GpuAllocEmpty(var.type.dtype, var.type.context_name)(*var.shape)
1055
1056
1057class GpuContiguous(Op):
1058    """
1059    Return a C contiguous version of the input.
1060
1061    This may either pass the object as-is (if already C contiguous) or
1062    make a copy.
1063
1064    """
1065    __props__ = ()
1066    view_map = {0: [0]}
1067    _f16_ok = True
1068
1069    def grad(self, inputs, dout):
1070        x, = inputs
1071        dout, = dout
1072        dout = as_gpuarray_variable(dout, context_name=infer_context_name(x))
1073
1074        return [dout]
1075
1076    def make_node(self, input):
1077        input = as_gpuarray_variable(input,
1078                                     context_name=infer_context_name(input))
1079        return Apply(self, [input], [input.type()])
1080
1081    def c_header_dirs(self):
1082        return [gpuarray_helper_inc_dir()]
1083
1084    def c_headers(self):
1085        return ['<gpuarray_helper.h>']
1086
1087    def c_code_cache_version(self):
1088        return (4,)
1089
1090    def c_code(self, node, name, inp, out, sub):
1091        return """
1092        {
1093            if (GpuArray_IS_C_CONTIGUOUS(&(%(input)s->ga))) {
1094                Py_XDECREF(%(z)s);
1095                %(z)s = %(input)s;
1096                Py_INCREF(%(z)s);
1097
1098            } else if (NULL == %(z)s
1099                || !theano_size_check(%(z)s, PyGpuArray_NDIM(%(input)s), PyGpuArray_DIMS(%(input)s),
1100                                      %(input)s->ga.typecode)
1101                || !GpuArray_IS_C_CONTIGUOUS(&(%(z)s->ga)))
1102            {
1103                Py_XDECREF(%(z)s);
1104                %(z)s = pygpu_copy(%(input)s, GA_C_ORDER);
1105                if (!%(z)s)
1106                {
1107                    %(fail)s;
1108                }
1109            } else if(pygpu_move(%(z)s, %(input)s) == -1) {
1110                %(fail)s;
1111            }
1112        }
1113        """ % dict(input=inp[0], z=out[0], fail=sub['fail'])
1114
1115    def perform(self, node, inp, out_):
1116        x, = inp
1117        out, = out_
1118        out[0] = pygpu.ascontiguousarray(x)
1119
1120gpu_contiguous = GpuContiguous()
1121
1122
1123class GpuReshape(HideC, tensor.Reshape):
1124    """
1125    Reshape for GPU variables.
1126
1127    """
1128
1129    _f16_ok = True
1130
1131    # __hash__, __eq__, __str__ come from tensor.Reshape
1132    def make_node(self, x, shp):
1133        ctx_name = infer_context_name(x)
1134        x = as_gpuarray_variable(x, context_name=ctx_name)
1135        shp = tensor.as_tensor_variable(shp)
1136        res = x.transfer('cpu').reshape(shp, ndim=self.ndim)
1137        otype = GpuArrayType(dtype=res.dtype,
1138                             broadcastable=res.broadcastable,
1139                             context_name=ctx_name)
1140        return Apply(self, [x, shp], [otype()])
1141
1142    def perform(self, node, inp, out_, params):
1143        x, shp = inp
1144        out, = out_
1145        if (len(shp) != self.ndim):
1146            raise ValueError('shape argument to GpuReshape.perform'
1147                             ' has incorrect length %i'
1148                             ', should be %i' % (len(shp), self.ndim), shp)
1149
1150        if shp.prod() != x.size:
1151            # We need to do check here to raise the same error as NumPy.
1152            # We should make pygpu do the same.
1153            ss = 1
1154            nb_m1 = 0
1155            for i in shp:
1156                if i == -1:
1157                    nb_m1 += 1
1158                else:
1159                    ss *= i
1160            if nb_m1 > 1:
1161                raise ValueError("Only one -1 is accepted in the new shape")
1162            elif nb_m1 == 1:
1163                if (x.size % ss) != 0:
1164                    raise ValueError("When using -1 in new shape, the computed new shape must be an multiple of the original shape.")
1165            else:
1166                raise ValueError("total size of new array must be unchanged")
1167        out[0] = x.reshape(tuple(shp))
1168
1169    def c_code_cache_version(self):
1170        return (3,)
1171
1172    def c_code(self, node, name, inputs, outputs, sub):
1173        x, shape = inputs
1174        output, = outputs
1175        sdtype = node.inputs[1].type.dtype_specs()[1]
1176        just_fail = sub['fail']
1177        fail = """{
1178        free(new_dims);
1179        %(just_fail)s
1180        }""" % dict(just_fail=just_fail)
1181        params = sub['params']
1182        return """
1183        size_t old_size = 1, new_size = 1;
1184        size_t* new_dims = NULL;
1185        int compute_axis = -1;
1186
1187        assert (PyArray_NDIM(%(shape)s) == 1);
1188        if (PyArray_DIM(%(shape)s, 0) != %(params)s->ndim)
1189        {
1190            PyErr_Format(PyExc_ValueError,
1191                         "GpuReshape: given shape is of incorrect "
1192                         "length (%%d should be %%d).",
1193                         PyArray_DIM(%(shape)s, 0), %(params)s->ndim);
1194            %(just_fail)s;
1195        }
1196
1197        new_dims = (size_t*) malloc(sizeof(size_t) * %(params)s->ndim);
1198        if (new_dims == NULL) {
1199            PyErr_NoMemory();
1200            %(just_fail)s
1201        }
1202
1203        for (size_t i = 0; i < %(x)s->ga.nd; ++i)
1204            old_size *= %(x)s->ga.dimensions[i];
1205
1206        for (size_t i = 0; i < %(params)s->ndim; ++i)
1207        {
1208            new_dims[i] = ((%(sdtype)s*)(
1209                    PyArray_BYTES(%(shape)s) +
1210                    i * PyArray_STRIDES(%(shape)s)[0]))[0];
1211            if (new_dims[i] == -1)
1212            {
1213                if (compute_axis != -1)
1214                {
1215                    PyErr_Format(PyExc_ValueError,
1216                                 "GpuReshape: only one -1 is accepted "
1217                                 "in the new shape, but got two at "
1218                                 "indices %%d and %%zu.",
1219                                 compute_axis, i);
1220                    %(fail)s;
1221                }
1222                compute_axis = i;
1223            }
1224            else
1225                new_size *= new_dims[i];
1226        }
1227
1228        if (compute_axis == -1 && new_size != old_size)
1229        {
1230            PyErr_Format(PyExc_ValueError,
1231                         "GpuReshape: trying to reshape an array of "
1232                         "total size %%zu into an array of total size "
1233                         "%%zu.", old_size, new_size);
1234            %(fail)s;
1235        }
1236        else if (compute_axis != -1 && old_size %% new_size != 0)
1237        {
1238            PyErr_Format(PyExc_ValueError,
1239                         "GpuReshape: -1 axis found at index %%d in "
1240                         "new shape but the total size of the array "
1241                         "(%%zu) is not divisible by the given shapes "
1242                         "(%%zu).", compute_axis, old_size, new_size);
1243            %(fail)s;
1244        }
1245
1246        Py_XDECREF(%(output)s);
1247        %(output)s = pygpu_reshape(%(x)s, %(params)s->ndim, new_dims,
1248                                   GA_C_ORDER, 0, compute_axis);
1249        free(new_dims);
1250        if (%(output)s == NULL)
1251        {
1252            %(just_fail)s;
1253        }
1254        """ % locals()
1255
1256
1257class GpuJoin(HideC, Join):
1258    """
1259    Join for GPU.
1260
1261    """
1262    _f16_ok = True
1263    __props__ = ("view",)
1264    params_type = gpu_context_type
1265
1266    def __init__(self, view=-1):
1267        self.view = view
1268        if view != -1:
1269            # since the first input is always the axis, the tensors
1270            # start from index 1.
1271            self.view_map = {0: [1 + view]}
1272
1273    def __str__(self):
1274        return Join.__str__(self)
1275
1276    def make_node(self, axis, *tensors):
1277        node = Join.make_node(self, axis, *tensors)
1278
1279        ctx_name = infer_context_name(*tensors)
1280
1281        def agv(v):
1282            return as_gpuarray_variable(v, context_name=ctx_name)
1283
1284        return Apply(self, [node.inputs[0]] + list(map(agv, tensors)),
1285                     [GpuArrayType(broadcastable=node.outputs[0].broadcastable,
1286                                   dtype=node.outputs[0].dtype,
1287                                   context_name=ctx_name)()])
1288
1289    def get_params(self, node):
1290        return node.outputs[0].type.context
1291
1292    def perform(self, node, axis_and_tensors, out_, ctx):
1293        out, = out_
1294        view = self.view
1295        axis = int(axis_and_tensors[0])
1296        tensors = axis_and_tensors[1:]
1297
1298        if axis < -axis_and_tensors[1].ndim:
1299            raise IndexError
1300        if axis < 0:
1301            axis += axis_and_tensors[1].ndim
1302        # we check these tensors for being empty.
1303        if (view != -1) and np.all(
1304                [tensor.shape[axis] == 0 for tensor in
1305                 tensors[0:view] + tensors[view + 1:]]):
1306            out[0] = tensors[view]
1307        else:
1308            out[0] = pygpu.concatenate(tensors, axis=axis, context=ctx).astype(
1309                node.outputs[0].dtype)
1310
1311    def c_code_cache_version(self):
1312        return (3,)
1313
1314    def c_support_code(self):
1315        return """
1316        #if PY_MAJOR_VERSION >= 3
1317        #define PyInt_AsLong PyLong_AsLong
1318        #endif
1319        """
1320
1321    def c_headers(self):
1322        return ['<numpy_compat.h>']
1323
1324    def c_code(self, node, name, inputs, out_, sub):
1325        axis, tensors = inputs[0], inputs[1:]
1326        copy_to_list = []
1327        restype = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
1328        view = self.view
1329        non_empty_tensor = tensors[view]
1330        for i, inp in enumerate(tensors):
1331            copy_to_list.append("als[%s] = &%s->ga;" % (i, inp))
1332
1333        n = len(tensors)
1334        fail = sub['fail']
1335        out = out_[0]
1336        copy_inputs_to_list = '\n'.join(copy_to_list)
1337        ctx = sub['params']
1338
1339        code = """
1340        const GpuArray **als = (const GpuArray **)PyMem_Malloc(sizeof(GpuArray *) *
1341                                                       %(n)s);
1342        if (als == NULL) {
1343            PyErr_NoMemory();
1344            %(fail)s
1345        }
1346        %(copy_inputs_to_list)s
1347        Py_XDECREF(%(out)s);
1348        {
1349            int axis = PyInt_AsLong((PyObject *)%(axis)s);
1350            if (axis < 0) {
1351                if (axis == -1 && PyErr_Occurred()) {
1352                    %(fail)s
1353                }
1354                axis += als[0]->nd;
1355                if (axis < 0) {
1356                    PyErr_SetString(PyExc_IndexError, "invalid axis");
1357                    %(fail)s
1358                }
1359            }
1360
1361            int tensors_lens_sum;
1362            if(%(view)s != -1) {
1363                tensors_lens_sum = 0;
1364                for(int i=0; i < %(n)s; i++){
1365                    tensors_lens_sum += als[i]->dimensions[axis];
1366                }
1367                tensors_lens_sum -= PyGpuArray_DIM(%(non_empty_tensor)s, axis);
1368            }
1369
1370            if(%(view)s != -1 && tensors_lens_sum == 0) {
1371                Py_INCREF(%(non_empty_tensor)s);
1372                %(out)s = %(non_empty_tensor)s;
1373            }else{
1374                %(out)s = pygpu_concatenate(als, %(n)s, axis,
1375                                            %(restype)s, (PyObject *)&PyGpuArrayType,
1376                                            %(ctx)s);
1377            }
1378        }
1379        PyMem_Free(als);
1380        if (%(out)s == NULL)
1381            %(fail)s
1382
1383        """ % locals()
1384        return code
1385
1386gpu_join = GpuJoin()
1387
1388
1389class GpuSplit(HideC, Split):
1390    """
1391    Split for GPU.
1392
1393    """
1394    _f16_ok = True
1395
1396    def __init__(self, len_splits):
1397        super(GpuSplit, self).__init__(len_splits)
1398        # The GPU version of Split returns splits as views of the input.
1399        self.view_map = {}
1400        for i in xrange(self.len_splits):
1401            self.view_map[i] = [0]
1402
1403    def make_node(self, x, axis, splits):
1404        node = Split.make_node(self, x, axis, splits)
1405        x = as_gpuarray_variable(x, infer_context_name(x))
1406        outs = [GpuArrayType(dtype=o.dtype, broadcastable=o.broadcastable,
1407                             context_name=x.type.context_name)()
1408                for o in node.outputs]
1409        return Apply(self, [x] + node.inputs[1:], outs)
1410
1411    # we reuse the perform of the CPU op, which is suitable
1412
1413    def c_code_cache_version(self):
1414        return (2,)
1415
1416    def c_headers(self):
1417        return ['<numpy_compat.h>', '<gpuarray_helper.h>']
1418
1419    def c_header_dirs(self):
1420        return [pygpu.get_include(), gpuarray_helper_inc_dir()]
1421
1422    def c_code(self, node, name, inputs, outputs, sub):
1423        if self.len_splits == 0:
1424            # There are no outputs, then nothing to do.
1425            return ''
1426
1427        # outputs_pointers lists the addresses of the pointers to the outputs.
1428        outputs_pointers = '&' + (', &'.join(outputs))
1429        x, axis, splits = inputs
1430        fail = sub['fail']
1431        splits_dtype = node.inputs[2].type.dtype_specs()[1]
1432        axis_dtype = node.inputs[1].type.dtype_specs()[1]
1433        expected_splits_count = self.len_splits
1434
1435        main_code = """
1436        int ndim = PyGpuArray_NDIM(%(x)s);
1437        int axis = (int)(*(%(axis_dtype)s*)PyArray_GETPTR1(%(axis)s, 0));
1438        int splits_count = PyArray_DIM(%(splits)s, 0);
1439        size_t len_along_axis, sum_of_splits = 0;
1440        %(splits_dtype)s current_split_length;
1441        size_t* split_points = NULL;
1442        GpuArray* split_views = NULL;
1443        GpuArray** split_views_pointers = NULL;
1444        int i, j;
1445        PyGpuArrayObject** outputs[] = {%(outputs_pointers)s};
1446
1447        /* Check inputs. */
1448
1449        if (splits_count != %(expected_splits_count)s) {
1450            PyErr_Format(PyExc_ValueError,
1451                "GpuSplit: splits count (%%d) != expected count (%%d).", splits_count, %(expected_splits_count)s);
1452            %(fail)s
1453        }
1454        if (axis < 0) {
1455            axis += ndim;
1456        }
1457        if (axis < 0 || axis >= ndim) {
1458            PyErr_Format(PyExc_IndexError, "GpuSplit: invalid axis %%d for a %%d-D array.", axis, ndim);
1459            %(fail)s
1460        }
1461        len_along_axis = PyGpuArray_DIM(%(x)s, axis);
1462        for (i = 0; i < splits_count; ++i) {
1463            current_split_length = *(%(splits_dtype)s*)PyArray_GETPTR1(%(splits)s, i);
1464            if (current_split_length < 0) {
1465                PyErr_Format(PyExc_ValueError,
1466                    "GpuSplit: you try to take a negative number (%%ld) of elements.", current_split_length);
1467                %(fail)s
1468            }
1469            sum_of_splits += current_split_length;
1470        }
1471        if (sum_of_splits != len_along_axis) {
1472            PyErr_Format(PyExc_ValueError, "GpuSplit: the splits sums to %%ld, expected %%ld.", sum_of_splits, len_along_axis);
1473            %(fail)s
1474        }
1475
1476        /* Compute splits views. */
1477
1478        split_points = (size_t*) malloc((splits_count - 1) * sizeof(size_t));
1479        if (split_points == NULL) {
1480            PyErr_NoMemory();
1481            %(fail)s
1482        }
1483        split_points[0] = (size_t) (* (%(splits_dtype)s*) PyArray_GETPTR1(%(splits)s, 0) );
1484        for(i = 1; i < splits_count - 1; ++i) {
1485            split_points[i] = split_points[i - 1] + (size_t) (* (%(splits_dtype)s*) PyArray_GETPTR1(%(splits)s, i) );
1486        }
1487        split_views = (GpuArray*) malloc(splits_count * sizeof(GpuArray));
1488        split_views_pointers = (GpuArray**) malloc(splits_count * sizeof(GpuArray*));
1489        if (split_views == NULL || split_views_pointers == NULL) {
1490            PyErr_NoMemory();
1491            free(split_views_pointers);
1492            free(split_views);
1493            free(split_points);
1494            %(fail)s
1495        }
1496        for (i = 0; i < splits_count; ++i) {
1497            split_views_pointers[i] = split_views + i;
1498        }
1499        if (GpuArray_split(split_views_pointers, &%(x)s->ga, splits_count - 1, split_points, axis) != GA_NO_ERROR) {
1500            PyErr_SetString(PyExc_RuntimeError, "GpuSplit: unable to compute split.");
1501            for (i = 0; i < splits_count; ++i) {
1502                GpuArray_clear(split_views_pointers[i]);
1503            }
1504            free(split_views_pointers);
1505            free(split_views);
1506            free(split_points);
1507            %(fail)s
1508        }
1509
1510        /* Put split views into outputs. */
1511        for (i = 0; i < splits_count; ++i) {
1512            PyGpuArrayObject** output = outputs[i];
1513            Py_XDECREF(*output);
1514            *output = pygpu_fromgpudata(
1515                split_views[i].data,
1516                split_views[i].offset,
1517                split_views[i].typecode,
1518                split_views[i].nd,
1519                split_views[i].dimensions,
1520                split_views[i].strides,
1521                %(x)s->context,
1522                1, // output is writable
1523                Py_None, Py_None
1524            );
1525            if (*output == NULL) {
1526                PyErr_SetString(PyExc_RuntimeError, "GpuSplit: unable to update an output from a split view.");
1527                for (j = 0; j < splits_count; ++j) {
1528                    GpuArray_clear(split_views_pointers[j]);
1529                }
1530                free(split_views_pointers);
1531                free(split_views);
1532                free(split_points);
1533                %(fail)s
1534            }
1535        }
1536
1537        /* Free memory. */
1538        for (i = 0; i < splits_count; ++i) {
1539           GpuArray_clear(split_views_pointers[i]);
1540        }
1541        free(split_views_pointers);
1542        free(split_views);
1543        free(split_points);
1544        """
1545
1546        return main_code % locals()
1547
1548
1549@theano.compile.profiling.register_profiler_printer
1550def profile_printer(message, compile_time, fct_call_time,
1551                    apply_time, apply_cimpl, outputs_size, file):
1552    if any([x.op.__class__.__name__.lower().startswith("gpu")
1553            for x in apply_time.keys()]):
1554        local_time = sum(apply_time.values())
1555        print('', file=file)
1556        print('Some info useful for gpu:', file=file)
1557
1558        fgraphs = set()
1559        for node in apply_time.keys():
1560            fgraphs.add(node.fgraph)
1561
1562        cpu = 0
1563        gpu = 0
1564        trans = 0
1565        for node, t in iteritems(apply_time):
1566            if isinstance(node.op, (HostFromGpu, GpuFromHost)):
1567                trans += t
1568            elif node.op.__class__.__name__.lower().startswith("gpu"):
1569                gpu += t
1570            else:
1571                cpu += t
1572        print('', file=file)
1573        print("    Spent %.3fs(%.2f%%) in cpu Op, %.3fs(%.2f%%) in gpu Op and %.3fs(%.2f%%) transfert Op" % (
1574            cpu, cpu / local_time * 100, gpu, gpu / local_time * 100,
1575            trans, trans / local_time * 100), file=file)
1576
1577        print('', file=file)
1578        print("    Theano function input that are float64", file=file)
1579        print("    <fct name> <input name> <input type> <str input>", file=file)
1580        for fg in fgraphs:
1581            for i in fg.inputs:
1582                if hasattr(i.type, 'dtype') and i.type.dtype == 'float64':
1583                    print('        ', fg.name, i.name, i.type, i, file=file)
1584
1585        print('', file=file)
1586        print("    List of apply that don't have float64 as input but have float64 in outputs", file=file)
1587        print("    (Useful to know if we forgot some cast when using floatX=float32 or gpu code)", file=file)
1588        print('    <Apply> <Apply position> <fct name> <inputs type> <outputs type>', file=file)
1589        for fg in fgraphs:
1590            for idx, node in enumerate(fg.toposort()):
1591                if (any(hasattr(i, 'dtype') and i.dtype == 'float64'
1592                        for i in node.outputs) and
1593                    not any(hasattr(i, 'dtype') and i.dtype == 'float64'
1594                            for i in node.inputs)):
1595
1596                    print('        ', str(node), idx, fg.name, end=' ',
1597                          file=file)
1598                    print(str([getattr(i, 'dtype', None)
1599                               for i in node.inputs]), end=' ', file=file)
1600                    print(str([getattr(i, 'dtype', None)
1601                               for i in node.outputs]), file=file)
1602        print('', file=file)
1603
1604
1605class GpuEye(GpuKernelBase, Op):
1606    """
1607    Eye for GPU.
1608
1609    """
1610    __props__ = ('dtype', 'context_name')
1611    _f16_ok = True
1612
1613    def __init__(self, dtype=None, context_name=None):
1614        if dtype is None:
1615            dtype = config.floatX
1616        self.dtype = dtype
1617        self.context_name = context_name
1618
1619    def get_params(self, node):
1620        return get_context(self.context_name)
1621
1622    def make_node(self, n, m, k):
1623        n = tensor.as_tensor_variable(n)
1624        m = tensor.as_tensor_variable(m)
1625        k = tensor.as_tensor_variable(k)
1626        assert n.ndim == 0
1627        assert m.ndim == 0
1628        assert k.ndim == 0
1629        otype = GpuArrayType(dtype=self.dtype,
1630                             broadcastable=(False, False),
1631                             context_name=self.context_name)
1632
1633        return Apply(self, [n, m, k], [otype()])
1634
1635    def infer_shape(self, node, in_shapes):
1636        out_shape = [node.inputs[0], node.inputs[1]]
1637        return [out_shape]
1638
1639    def grad(self, inp, grads):
1640        return [grad_undefined(self, i, inp[i])
1641                for i in xrange(3)]
1642
1643    def gpu_kernels(self, node, name):
1644        code = """#include "cluda.h"
1645
1646KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
1647                ga_size n, ga_size m, ga_ssize k) {
1648    a = (GLOBAL_MEM %(ctype)s *)(((GLOBAL_MEM char *)a) + a_off);
1649    ga_ssize coff = max(k, (ga_ssize) 0);
1650    ga_ssize roff = -min(k, (ga_ssize) 0);
1651    ga_size nb = (ga_size) min(n - roff, m - coff);
1652    for (ga_size i = LID_0; i < nb; i += LDIM_0) {
1653        a[(i + roff)*m + i + coff] = %(write_a)s(1);
1654    }
1655}""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype),
1656            name=name, write_a=write_w(self.dtype))
1657        return [Kernel(
1658                code=code, name="eye",
1659                params=[gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE,
1660                        gpuarray.SIZE, gpuarray.SSIZE],
1661                flags=Kernel.get_flags(self.dtype),
1662                objvar='k_eye_' + name)]
1663
1664    def c_code(self, node, name, inp, out, sub):
1665        if len(inp) == 2:
1666            n, m = inp
1667            k = 0
1668        elif len(inp) == 3:
1669            n, m, k = inp
1670
1671        z, = out
1672        fail = sub['fail']
1673        ctx = sub['params']
1674        typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype)
1675        kname = self.gpu_kernels(node, name)[0].objvar
1676        s = """
1677        size_t dims[2] = {0, 0};
1678        size_t ls, gs;
1679        ssize_t k;
1680        size_t col_off;
1681        size_t row_off;
1682        int err;
1683
1684        dims[0] = ((dtype_%(n)s*)PyArray_DATA(%(n)s))[0];
1685        dims[1] = ((dtype_%(m)s*)PyArray_DATA(%(m)s))[0];
1686        k = ((dtype_%(k)s*)PyArray_DATA(%(k)s))[0];
1687
1688        Py_CLEAR(%(z)s);
1689
1690        %(z)s = pygpu_zeros(2, dims,
1691                            %(typecode)s,
1692                            GA_C_ORDER,
1693                            %(ctx)s, Py_None);
1694        if (%(z)s == NULL) {
1695            %(fail)s
1696        }
1697
1698        ls = 1;
1699        gs = 256;
1700        col_off = (size_t) (k > 0?k:0);
1701        row_off = (size_t) (k < 0?-k:0);
1702        if (row_off < dims[0] && col_off < dims[1]) {
1703            err = eye_call(1, &gs, &ls, 0, %(z)s->ga.data, %(z)s->ga.offset,
1704                           dims[0], dims[1], k);
1705            if (err != GA_NO_ERROR) {
1706                PyErr_Format(PyExc_RuntimeError,
1707                             "gpuarray error: kEye: %%s. n%%lu, m=%%lu.",
1708                             GpuKernel_error(&%(kname)s, err),
1709                             (unsigned long)dims[0], (unsigned long)dims[1]);
1710                %(fail)s;
1711            }
1712        }
1713
1714        """ % locals()
1715
1716        return s
1717
1718    def c_code_cache_version(self):
1719        return (10,)
1720
1721
1722class GpuTri(GpuKernelBase, Op):
1723    """
1724    Tri for GPU.
1725
1726    """
1727    __props__ = ('dtype', 'context_name')
1728    _f16_ok = True
1729
1730    def __init__(self, dtype=None, context_name=None):
1731        if dtype is None:
1732            dtype = config.floatX
1733        self.dtype = dtype
1734        self.context_name = context_name
1735
1736    def get_params(self, node):
1737        return get_context(self.context_name)
1738
1739    def make_node(self, n, m, k):
1740        n = tensor.as_tensor_variable(n)
1741        m = tensor.as_tensor_variable(m)
1742        k = tensor.as_tensor_variable(k)
1743        assert n.ndim == 0
1744        assert m.ndim == 0
1745        assert k.ndim == 0
1746        otype = GpuArrayType(dtype=self.dtype,
1747                             broadcastable=(False, False),
1748                             context_name=self.context_name)
1749
1750        return Apply(self, [n, m, k], [otype()])
1751
1752    def infer_shape(self, node, in_shapes):
1753        out_shape = [node.inputs[0], node.inputs[1]]
1754        return [out_shape]
1755
1756    def grad(self, inp, grads):
1757        return [grad_undefined(self, i, inp[i])
1758                for i in xrange(3)]
1759
1760    def gpu_kernels(self, node, name):
1761        code = """#include "cluda.h"
1762
1763KERNEL void tri(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
1764                ga_size n, ga_size m, ga_ssize k) {
1765    a = (GLOBAL_MEM %(ctype)s *)(((GLOBAL_MEM char *)a) + a_off);
1766    ga_ssize coff = max(k, (ga_ssize) 0);
1767    ga_ssize roff = -min(k, (ga_ssize) 0);
1768    for (ga_size i = LID_0; i < min(n - roff,n); i += LDIM_0) {
1769        for (ga_size j = 0; j <= min(i + coff,m-1); j++) {
1770          a[(i + roff)*m + j] = %(write_a)s(1);
1771        }
1772    }
1773}""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype),
1774            name=name, write_a=write_w(self.dtype))
1775        return [Kernel(
1776                code=code, name="tri",
1777                params=[gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE,
1778                        gpuarray.SIZE, gpuarray.SSIZE],
1779                flags=Kernel.get_flags(self.dtype),
1780                objvar='k_tri_' + name)]
1781
1782    def c_code(self, node, name, inp, out, sub):
1783        if len(inp) == 2:
1784            n, m = inp
1785            k = 0
1786        elif len(inp) == 3:
1787            n, m, k = inp
1788
1789        z, = out
1790        fail = sub['fail']
1791        ctx = sub['params']
1792        typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype)
1793        kname = self.gpu_kernels(node, name)[0].objvar
1794        s = """
1795        size_t dims[2] = {0, 0};
1796        size_t ls, gs;
1797        ssize_t k;
1798        int err;
1799
1800        dims[0] = ((dtype_%(n)s*)PyArray_DATA(%(n)s))[0];
1801        dims[1] = ((dtype_%(m)s*)PyArray_DATA(%(m)s))[0];
1802        k = ((dtype_%(k)s*)PyArray_DATA(%(k)s))[0];
1803
1804        Py_CLEAR(%(z)s);
1805
1806        %(z)s = pygpu_zeros(2, dims,
1807                            %(typecode)s,
1808                            GA_C_ORDER,
1809                            %(ctx)s, Py_None);
1810        if (%(z)s == NULL) {
1811            %(fail)s
1812        }
1813
1814        ls = 1;
1815        gs = 256;
1816        err = tri_call(1, &gs, &ls, 0, %(z)s->ga.data, %(z)s->ga.offset,
1817                       dims[0], dims[1], k);
1818        if (err != GA_NO_ERROR) {
1819            PyErr_Format(PyExc_RuntimeError,
1820                         "gpuarray error: kTri: %%s. n%%lu, m=%%lu.",
1821                         GpuKernel_error(&%(kname)s, err),
1822                         (unsigned long)dims[0], (unsigned long)dims[1]);
1823            %(fail)s;
1824        }
1825        """ % locals()
1826
1827        return s
1828
1829    def c_code_cache_version(self):
1830        return (1,)
1831