1from __future__ import absolute_import, print_function, division
2
3import numpy as np
4from six import integer_types
5from six.moves import StringIO, xrange
6
7from theano import tensor, gof, Op
8from theano.gof import ParamsType
9from theano.gradient import grad_not_implemented
10import theano.tensor as T
11from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
12from theano.tensor import AllocDiag
13from theano.scalar import bool as bool_t, int32 as int_t, uint32 as size_t
14
15try:
16    import pygpu
17    from pygpu import gpuarray
18except ImportError:
19    pass
20
21from .type import GpuArrayType, gpu_context_type
22from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel, gpuarray_helper_inc_dir,
23                        infer_context_name, gpu_contiguous)
24
# Cache of compiled in-place-add kernels, keyed by (a dtype, b dtype, context).
iadd_reg = {}


def get_iadd(a, b):
    """Return a (cached) pygpu GpuElemwise computing ``a = a + b``.

    `a` and `b` are symbolic variables; only their dtypes and context are
    used, so one kernel is shared by all nodes with matching key.
    """
    key = (a.type.dtype, b.type.dtype, a.type.context)
    kernel = iadd_reg.get(key)
    if kernel is None:
        operands = [
            pygpu.elemwise.arg('a', a.type.dtype, read=True, write=True),
            pygpu.elemwise.arg('b', b.type.dtype, read=True),
        ]
        kernel = pygpu.elemwise.GpuElemwise(a.type.context, "a = a + b",
                                            operands, convert_f16=True)
        iadd_reg[key] = kernel
    return kernel
36
37
class GpuSubtensor(HideC, Subtensor):
    """
    Subtensor on the GPU.

    The index bookkeeping (``self.idx_list``) is inherited from
    ``tensor.Subtensor``; this class only changes the storage type and
    supplies a GPU C implementation.
    """
    # Indexing moves data without arithmetic, so float16 is safe here.
    _f16_ok = True

    def make_node(self, x, *inputs):
        """Build the Apply node with `x` and the output moved to the GPU."""
        ctx_name = infer_context_name(x)
        # Reuse the CPU Op's make_node for dtype/broadcastable inference,
        # then rebuild the output type as a GpuArrayType on x's context.
        rval = tensor.Subtensor.make_node(self, x, *inputs)
        otype = GpuArrayType(dtype=rval.outputs[0].type.dtype,
                             broadcastable=rval.outputs[0].type.broadcastable,
                             context_name=ctx_name)
        x = as_gpuarray_variable(x, ctx_name)
        return gof.Apply(self, [x] + rval.inputs[1:], [otype()])

    def perform(self, node, inputs, out_):
        """Python fallback: index the pygpu array directly."""
        out, = out_
        x = inputs[0]

        # Substitute the runtime scalar inputs into the symbolic index
        # description, producing a concrete tuple of ints/slices.
        cdata = get_idx_list(inputs, self.idx_list)
        if len(cdata) == 1:
            # Unwrap a one-element tuple so plain x[i] indexing applies,
            # mirroring tensor.Subtensor.perform.
            cdata = cdata[0]

        out[0] = x.__getitem__(cdata)

    def c_support_code(self):
        # C helper that canonicalizes one slice the way Python does:
        # *_n flags mark components that were None; negative indices are
        # wrapped, then start/stop are clamped to [0, len] (or [-1, len-1]
        # for negative steps).  Returns -1 with an exception set on step==0.
        return """
        static int fix_indices(ssize_t *start, ssize_t *stop, ssize_t *step,
                               int start_n, int stop_n, int step_n,
                               size_t len) {
            if (step_n) *step = 1;
            if (*step == 0) {
                PyErr_SetString(PyExc_ValueError, "slice step cannot be zero");
                return -1;
            }
            if (start_n) *start = (*step < 0) ? len-1 : 0;
            else {
                if (*start < 0) *start += len;
                if (*start < 0) *start = (*step < 0) ? -1 : 0;
                if (*start > -1 && *start >= len) {
                    *start = (*step < 0) ? len-1 : len;
                }
            }

            if (stop_n) *stop = (*step < 0) ? -1 : len;
            else {
                if (*stop < 0) *stop += len;
                if (*stop < 0) *stop = (*step < 0) ? -1 : 0;
                if (*stop > -1 && *stop >= len) {
                    *stop = (*step < 0) ? len-1 : len;
                }
            }
            if (*stop < *start && *step > 0)
                *stop = *start;
            return 0;
        }
        """

    def c_code(self, node, name, inputs, outputs, sub):
        """Generate C code that fills starts/stops/steps and calls pygpu_index."""
        inp_ndim = node.inputs[0].ndim
        inp = inputs[0]
        # Remaining C variable names for runtime scalar indices; consumed
        # in order by fix_idx()/the integer branch below.
        indices = inputs[1:]

        # pad out the index list to the same dimension as the input
        idx_list = self.idx_list + \
            ((slice(None),) * (inp_ndim - len(self.idx_list)))

        # This case fails when we use pygpu_index(), so here is some
        # special code
        if len(idx_list) == 0:
            # 0-d input: just copy it.
            return """
        Py_XDECREF(%(out)s);
        %(out)s = pygpu_copy(%(inp)s, GA_ANY_ORDER);
        if (!%(out)s) {
            // Exception already set
            %(fail)s
        }
""" % dict(out=outputs[0], inp=inp, fail=sub['fail'])

        sio = StringIO()
        print("""
        ssize_t starts[%(sz)s];
        ssize_t stops[%(sz)s];
        ssize_t steps[%(sz)s];
        ssize_t cur;
        int err;

        if (%(inp)s->ga.nd != %(sz)s) {
            PyErr_SetString(PyExc_IndexError, "invalid index");
            %(fail)s
        }
        """ % dict(sz=len(idx_list), inp=inp, fail=sub['fail']), file=sio)

        def fix_idx(idx):
            # Return (C expression, was_None flag) for one slice component.
            if idx is None:
                return "0", 1
            elif isinstance(idx, (np.integer, integer_types)):
                # Compile-time constant index.
                return str(idx), 0
            elif isinstance(idx, gof.Type):
                # Runtime scalar: consume the next C input variable.
                return indices.pop(0), 0
            else:
                assert 0, idx

        for i, idx in enumerate(idx_list):
            if isinstance(idx, slice):
                start, start_n = fix_idx(idx.start)
                stop, stop_n = fix_idx(idx.stop)
                step, step_n = fix_idx(idx.step)
                print("""
                starts[%(i)s] = %(start)s;
                stops[%(i)s] = %(stop)s;
                steps[%(i)s] = %(step)s;
                if (fix_indices(&starts[%(i)s], &stops[%(i)s], &steps[%(i)s],
                                %(start_n)s, %(stop_n)s, %(step_n)s,
                                %(inp)s->ga.dimensions[%(i)s]) == -1) {
                    %(fail)s
                }
                """ % dict(i=i, start=start, stop=stop, step=step,
                           start_n=start_n, stop_n=stop_n, step_n=step_n,
                           fail=sub['fail'], inp=inp), file=sio)
            else:
                # Integer index (constant or runtime scalar): step 0 tells
                # pygpu_index to drop this dimension.
                if isinstance(idx, gof.Type):
                    start = indices.pop(0)
                elif isinstance(idx, (np.integer, integer_types)):
                    start = idx
                else:
                    assert 0, idx
                print("""
                cur = %(start)s;
                if (cur < 0)
                    cur += %(inp)s->ga.dimensions[%(i)s];
                starts[%(i)s] = cur;
                steps[%(i)s] = 0;
                """ % dict(i=i, start=start, fail=sub['fail'], inp=inp), file=sio)

        print("""
        Py_XDECREF(%(out)s);
        %(out)s = pygpu_index(%(inp)s, starts, stops, steps);
        if (!%(out)s) { %(fail)s }
""" % dict(name=name, fail=sub['fail'], inp=inp, out=outputs[0]), file=sio)

        return sio.getvalue()

    def c_code_cache_version(self):
        # Bump whenever the generated C code above changes.
        return (8,)
183
184
class GpuIncSubtensor(IncSubtensor):
    """
    Implement IncSubtensor on the gpu.

    Notes
    -----
    The optimization to make this inplace is in tensor/opt.
    The same optimization handles IncSubtensor and GpuIncSubtensor.
    This Op has c_code too; it inherits tensor.IncSubtensor's c_code.
    The helper methods like :meth:`do_type_checking`,
    :meth:`copy_of_x`, etc. specialize the c_code for this Op.

    """
    # The kernels used (setarray / elemwise add) handle float16.
    _f16_ok = True
    # The Op's params object is just the GPU context.
    params_type = gpu_context_type

    def make_node(self, x, y, *inputs):
        """Move `x` and `y` to the GPU and build the node; output has x's type."""
        ctx_name = infer_context_name(x, y)
        x = as_gpuarray_variable(x, ctx_name)
        y = as_gpuarray_variable(y, ctx_name)
        # Reuse the CPU Op's make_node for index validation; keep only its
        # scalar index inputs (rval.inputs[2:]).
        rval = tensor.IncSubtensor.make_node(self, x, y, *inputs)
        ret = gof.Apply(self, [x, y] + rval.inputs[2:], [x.type()])
        return ret

    def get_params(self, node):
        # The context of the output (== input) array.
        return node.outputs[0].type.context

    def perform(self, node, inputs, out_, ctx):
        """Python fallback: set or increment x[idx] with y on the GPU."""
        out, = out_
        x, y = inputs[:2]
        # Reversed so convert() can consume runtime scalars with pop()
        # (pop from the end) in their original order.
        indices = list(reversed(inputs[2:]))

        def convert(entry):
            # Substitute the next runtime scalar for a symbolic Type;
            # recurse into slices; pass constants through unchanged.
            if isinstance(entry, gof.Type):
                rval = indices.pop()
                return rval
            elif isinstance(entry, slice):
                return slice(convert(entry.start),
                             convert(entry.stop),
                             convert(entry.step))
            else:
                return entry

        cdata = tuple(map(convert, self.idx_list))
        if len(cdata) == 1:
            # Unwrap so plain x[i] indexing applies.
            cdata = cdata[0]
        if not self.inplace:
            x = x.copy()
        # sub_x is a view into x, so writing through it updates x.
        sub_x = x.__getitem__(cdata)
        if sub_x.shape:
            # we've sliced out an N-D tensor with N > 0
            if not self.set_instead_of_inc:
                # sub_x += y
                iadd = get_iadd(node.inputs[0], node.inputs[1])
                iadd(sub_x, y)
            else:
                # sub_x[...] = y
                x.__setitem__(cdata, y)
        else:
            # scalar case
            if not self.set_instead_of_inc:
                # x.__setitem__(cdata, sub_x + y)
                tmp = pygpu.elemwise.elemwise2(sub_x, '+', y, sub_x,
                                               broadcast=False)
                x.__setitem__(cdata, tmp)
            else:
                x.__setitem__(cdata, y)
        out[0] = x

    def do_type_checking(self, node):
        """
        Should raise NotImplementedError if c_code does not support
        the types involved in this node.

        """

        if not isinstance(node.inputs[0].type, GpuArrayType):
            raise NotImplementedError()

    def copy_of_x(self, x):
        """

        Parameters
        ----------
        x
            A string giving the name of a C variable pointing to an array.

        Returns
        -------
        str
            C code expression to make a copy of x.

        Notes
        -----
        Base class uses `PyArrayObject *`, subclasses may override for
        different types of arrays.

        """
        return """pygpu_copy(%(x)s, GA_ANY_ORDER)""" % locals()

    def decl_view(self):
        # C declaration for the view variable used by the inherited c_code.
        return "PyGpuArrayObject* zview = NULL;"

    def make_view_array(self, x, view_ndim):
        """
        Return C code assigning ``zview`` a view into `x`.

        The view shares `x`'s data buffer (pygpu_fromgpudata with `x` as
        base), using the ``xview_offset``/``xview_dims``/``xview_strides``
        variables computed earlier by the inherited helper c_code.

        Parameters
        ----------
        x
            A string identifying an array to be viewed.
        view_ndim
            A string specifying the number of dimensions to have in the view.
            This doesn't need to actually set up the view with the
            right indexing; we'll do that manually later.

        """
        ret = """
        size_t dims[%(view_ndim)s];
        for(int i=0; i<%(view_ndim)s; i++)
            dims[i] = xview_dims[i];

        zview = pygpu_fromgpudata(%(x)s->ga.data,
                                  %(x)s->ga.offset + xview_offset,
                                  %(x)s->ga.typecode,
                                  %(view_ndim)s,
                                  dims,
                                  xview_strides,
                                  %(x)s->context,
                                  1,
                                  (PyObject *)%(x)s,
                                  (PyObject *)&PyGpuArrayType);
        """ % locals()
        return ret

    def get_helper_c_code_args(self):
        """
        Return a dictionary of arguments to use with helper_c_code.

        """
        # GPU strides are in elements here, so no multiplier is needed.
        return {'c_prefix': 'PyGpuArray',
                'strides_mul': 1
                }

    def copy_into(self, view, source):
        """

        Parameters
        ----------
        view : string
            C code expression for an array.
        source : string
            C code expression for an array.

        Returns
        -------
        str
            C code expression to copy source into view, and 0 on success.

        """
        return """sub_setarray(&%(view)s->ga, &%(source)s->ga)""" % locals()

    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray/error.h>', '<gpuarray/array.h>',
                '<gpuarray/elemwise.h>']

    def c_support_code(self):
        # Thin wrapper over GpuArray_setarray that converts libgpuarray
        # errors into a Python RuntimeError.
        return """
int sub_setarray(GpuArray *dst, GpuArray *src) {
  int err;
  err = GpuArray_setarray(dst, src);
  if (err != GA_NO_ERROR)
    PyErr_SetString(PyExc_RuntimeError, GpuArray_error(src, err));
  return err;
}
"""

    def c_support_code_struct(self, node, nodename):
        # Per-thunk storage for the compiled in-place-add kernel.
        return "\nGpuElemwise *iadd;\n"

    def c_init_code_struct(self, node, name, sub):
        # Compile an "a += b" GpuElemwise once when the thunk struct is
        # initialized; reused by add_to_zview() on every call.
        return """
        gpuelemwise_arg args[2] = {{0}};
        args[0].name = "a";
        args[0].typecode = %(type1)s;
        args[0].flags = GE_READ|GE_WRITE;
        args[1].name = "b";
        args[1].typecode = %(type2)s;
        args[1].flags = GE_READ;
        iadd = GpuElemwise_new(%(ctx)s->ctx, "", "a += b",
                               2, args, %(nd)s, GE_CONVERT_F16);
        if (iadd == NULL) {
          PyErr_SetString(PyExc_RuntimeError, "Could not intialize inplace add support");
          %(fail)s
        }
        """ % dict(ctx=sub['params'], fail=sub['fail'],
                   type1=node.inputs[0].type.typecode,
                   type2=node.inputs[1].type.typecode,
                   nd=node.inputs[1].ndim)

    def add_to_zview(self, nodename, x, fail):
        """Return C code performing ``zview += x`` with the iadd kernel."""
        return """
        {
          void *args[2];
          args[0] = &zview->ga;
          args[1] = &%(x)s->ga;
          if (GpuElemwise_call(iadd, args, GE_BROADCAST | GE_PADSHAPE) != GA_NO_ERROR) {
            PyErr_SetString(PyExc_RuntimeError, "Error doing inplace add");
            Py_DECREF(zview);
            %(fail)s
          }
        }
        """ % locals()

    def c_code_cache_version(self):
        # Extend the parent's version tuple; disable caching if the
        # parent does.
        parent_version = super(GpuIncSubtensor, self).c_code_cache_version()
        if not parent_version:
            return
        return parent_version + (10,)
404
405
class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
    """
    AdvancedSubtensor1 on the GPU.

    Indexes the first axis of a GPU array with a vector of int64
    indices, using libgpuarray's ``GpuArray_take1``.
    """
    _f16_ok = True

    def make_node(self, x, ilist):
        ctx_name = infer_context_name(x, ilist)
        x_ = as_gpuarray_variable(x, ctx_name)

        ilist__ = tensor.as_tensor_variable(ilist)
        if ilist__.type.dtype not in tensor.integer_dtypes:
            raise TypeError('index must be integers')
        if ilist__.type.dtype != 'int64':
            # GpuArray_take1 needs int64 indices.
            ilist__ = tensor.cast(ilist__, 'int64')

        ilist_ = gpu_contiguous(as_gpuarray_variable(ilist__, ctx_name))

        # Defensive re-checks; the cast above already guarantees int64.
        if ilist_.type.dtype != 'int64':
            raise TypeError('index must be int64')
        if ilist_.type.ndim != 1:
            raise TypeError('index must be a vector')
        if x_.type.ndim == 0:
            raise TypeError('cannot index into a scalar')

        # Output: first axis replaced by the index vector's axis.
        bcast = ilist_.broadcastable + x_.broadcastable[1:]
        return gof.Apply(self, [x_, ilist_],
                         [GpuArrayType(dtype=x.dtype,
                                       context_name=ctx_name,
                                       broadcastable=bcast)()])

    def perform(self, node, inp, out_):
        # C-only Op: no Python implementation.
        raise NotImplementedError()

    def c_support_code(self):
        # Helper: does the (preallocated) output `a` match `v` on all axes
        # but the first?  Used to decide whether `a` can be reused.
        return """
int take1_match_dims(GpuArray *a, GpuArray *v) {
  if (a->nd != v->nd) return 0;
  for (unsigned int i = 1; i < v->nd; i++) {
    if (a->dimensions[i] != v->dimensions[i]) return 0;
  }
  return 1;
}
"""

    def c_code(self, node, name, inputs, outputs, sub):
        return """
int err;
if (%(out)s == NULL || !GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga) ||
    %(out)s->ga.dimensions[0] != %(idx)s->ga.dimensions[0] ||
    !take1_match_dims(&%(out)s->ga, &%(v)s->ga)) {
  size_t tmp;
  Py_XDECREF(%(out)s);

  /* This is a dirty hack to avoid an extra alloc */
  tmp = %(v)s->ga.dimensions[0];
  %(v)s->ga.dimensions[0] = %(idx)s->ga.dimensions[0];
  %(out)s = pygpu_empty(%(v)s->ga.nd, %(v)s->ga.dimensions, %(v)s->ga.typecode,
                        GA_C_ORDER, %(v)s->context, Py_None);
  if (%(out)s == NULL) {
    %(fail)s;
  }
  %(v)s->ga.dimensions[0] = tmp; // Don't remove this line
}

err = GpuArray_take1(&%(out)s->ga, &%(v)s->ga, &%(idx)s->ga, 1);
if (err != GA_NO_ERROR) {
  if (err == GA_VALUE_ERROR) {
    PyErr_SetString(PyExc_IndexError, "Index out of bounds.");
  } else {
    PyErr_SetString(PyExc_RuntimeError, GpuArray_error(&%(v)s->ga, err));
  }
  %(fail)s
}
""" % dict(out=outputs[0], v=inputs[0], idx=inputs[1], fail=sub['fail'])

    def c_code_cache_version(self):
        return (1,)
484
485
def check_and_convert_boolean_masks(input, idx_list):
    """
    Check boolean mask arrays in an index and convert them to index arrays.

    Each boolean mask must match the shape of the input dimensions it
    covers; this is enforced by NumPy 1.13.0 and newer, but not by
    earlier versions.  Matching masks are replaced by the integer index
    arrays from their ``nonzero()``.

    Parameters
    ----------
    input
        The array being indexed (anything with a ``shape`` attribute).
    idx_list : sequence
        Index entries: ``np.newaxis``, boolean ndarrays, or any other
        index object (ints, slices, integer arrays), passed through as-is.

    Returns
    -------
    list
        `idx_list` with every boolean mask expanded into one integer
        index array per mask dimension.

    Raises
    ------
    IndexError
        If a boolean mask's shape does not match the corresponding
        dimensions of `input`.
    """
    dim_seen = 0
    out_idx_list = []
    for index in idx_list:
        if index is np.newaxis:
            # skip, does not count as an input dimension
            out_idx_list.append(index)
        elif isinstance(index, np.ndarray) and index.dtype == 'bool':
            for i in range(index.ndim):
                if index.shape[i] != input.shape[dim_seen + i]:
                    raise IndexError('boolean index did not match indexed array '
                                     'along dimension %d; dimension is %d but '
                                     'corresponding boolean dimension is %d' %
                                     (dim_seen + i, input.shape[dim_seen + i],
                                      index.shape[i]))
            # An N-d mask consumes N input dims and expands to N index arrays.
            dim_seen += index.ndim
            out_idx_list += index.nonzero()
        else:
            dim_seen += 1
            out_idx_list.append(index)
    return out_idx_list
515
516
class BaseGpuAdvancedSubtensor(object):
    """Shared ``perform`` implementing NumPy-style advanced indexing on
    pygpu arrays, by bringing array-indexed axes to the front, flattening
    them, and gathering with ``take1``."""

    def perform(self, node, inputs, out_):
        out, = out_
        x = inputs[0]
        idx = inputs[1:]

        # convert boolean masks to index arrays
        idx = check_and_convert_boolean_masks(x, idx)

        # detect and transpose array indices
        nidx = []
        nshp = list(x.shape)
        for k, i in enumerate(idx):
            if i is None:
                # np.newaxis: insert a broadcast dim and index it fully.
                nidx.append(slice(None))
                nshp.insert(k, 1)
            else:
                nidx.append(i)

        x = x.reshape(nshp)

        transp = list(range(x.ndim))
        # number of array-indexed dimensions
        p = 0
        # ap represents the axis in the resulting array where the
        # dimensions indexed by arrays and ints will be inserted.
        # For instance, if all such dimensions are grouped together,
        # it corresponds to the index of the first such dimension in the
        # initial array.  If these dimensions are split (with slices
        # between), then the resulting dimensions will be moved to the
        # beginning, and ap will be 0.
        # If no such dimension has been encountered, ap is None.
        ap = None
        # Indicates whether we have already encountered an index (array
        # or number), and then a slice.
        slice_after_idx = False
        for k, i in enumerate(list(nidx)):
            if (isinstance(i, np.ndarray) and i.ndim != 0):
                # Move this array-indexed axis (and its index) to
                # position p, preserving the relative order of such axes.
                transp.remove(k)
                transp.insert(p, k)
                i = nidx.pop(k)
                nidx.insert(p, i)
                p += 1
                if ap is None:
                    # first non-slice index
                    ap = k
                elif slice_after_idx:
                    # We already encountered at least an array or int, and then
                    # a slice. Array-indexed axes are not grouped,
                    # moving to the beginning
                    ap = 0
            else:
                try:
                    # Integer-like index (has __index__).
                    i.__index__()
                    if ap is None:
                        ap = k
                    # indices do not break the contiguity of
                    # array-indexed axes
                except Exception:
                    # If we already encountered an array/int index, it
                    # means future ones will not be grouped.
                    if ap is not None:
                        slice_after_idx = True

        x = x.transpose(*transp)

        # Array-indexed axes are now first; keep them whole and apply the
        # remaining (slice/int) indices to the trailing axes.
        idx_ = ([slice(None)] * p + nidx[p:])
        x = x.__getitem__(idx_)

        if p == 0:
            assert ap is None
            # The only indexing was through slices and indices.
            # This can happen with symbolic slices for instance.
            # Since no view_map is set, we need to copy the returned value
            out[0] = x.copy()
            return

        # At this point, we should have encountered at least one array
        assert ap is not None

        # flatten the array-indexed dimensions
        shape = ((np.prod(x.shape[0: p]),) +
                 x.shape[p:])
        input_flat = x.reshape(shape)

        # build the strides
        strides = [1]
        for i in range(p - 1, 0, -1):
            stride = x.shape[i] * strides[0]
            strides.insert(0, stride)

        # build the indices and use it
        # take_idx broadcasts the p index arrays together into flat
        # (row-major) positions; take1 then gathers along the first axis.
        take_idx = sum((i * s for i, s in zip(nidx, strides)))
        out_flat = input_flat.take1(pygpu.asarray(take_idx.flatten(),
                                                  context=x.context))

        # finish up
        out_flat_shp = take_idx.shape + x.shape[p:]
        o = out_flat.reshape(out_flat_shp)

        if ap != 0:
            # Put the resulting indexing at the place that NumPy
            # decided was the right one.
            ntransp = list(range(take_idx.ndim, o.ndim))
            ntransp[ap:ap] = list(range(take_idx.ndim))
            o = o.transpose(*ntransp)

        out[0] = o
625
626
class GpuAdvancedSubtensor(HideC, BaseGpuAdvancedSubtensor, tensor.AdvancedSubtensor):
    """
    AdvancedSubtensor on the GPU.
    """
    def make_node(self, x, *inputs):
        context = infer_context_name(x)
        # Delegate dtype/broadcastable inference to the CPU Op.  Its
        # make_node calls tensor.subtensor.check_and_reject_bool(inputs),
        # which raises an IndexError if there are any boolean indices.
        cpu_node = tensor.AdvancedSubtensor.make_node(self, x, *inputs)
        out_type = cpu_node.outputs[0].type
        gpu_out = GpuArrayType(dtype=out_type.dtype,
                               broadcastable=out_type.broadcastable,
                               context_name=context)
        gpu_x = as_gpuarray_variable(x, context)
        return gof.Apply(self, [gpu_x] + cpu_node.inputs[1:], [gpu_out()])
642
643
class GpuAdvancedBooleanSubtensor(HideC, BaseGpuAdvancedSubtensor, tensor.AdvancedBooleanSubtensor):
    """
    AdvancedBooleanSubtensor on the GPU.
    """
    def make_node(self, x, *inputs):
        context = infer_context_name(x)
        # The CPU Op infers the output dtype/broadcastable pattern.
        cpu_node = tensor.AdvancedBooleanSubtensor.make_node(self, x, *inputs)
        out_type = cpu_node.outputs[0].type
        gpu_out = GpuArrayType(dtype=out_type.dtype,
                               broadcastable=out_type.broadcastable,
                               context_name=context)
        gpu_x = as_gpuarray_variable(x, context)
        return gof.Apply(self, [gpu_x] + cpu_node.inputs[1:], [gpu_out()])
656
657
class BaseGpuAdvancedIncSubtensor(object):
    """Shared ``perform`` implementing advanced ``x[idx] += y`` on pygpu
    arrays, by moving array-indexed axes to the front, flattening them,
    and applying an in-place-add kernel per gathered subarray."""

    def perform(self, node, inp, out_):
        out, = out_
        x = inp[0]
        y = inp[1]
        idx = inp[2:]
        # Always increment a copy; no destroy_map is declared here.
        x = x.copy()
        # Get a handle to the GpuElemwise object that will be called.
        # It is not necessary to have the right number of dimensions,
        # so we just pass symbolic x and y.
        iadd = get_iadd(node.inputs[0], node.inputs[1])

        # convert all indices to np.array
        # NOTE(review): this assigns into idx, so inp[2:] is assumed to be
        # a list (Theano passes perform inputs as a list) — confirm.
        for i in range(len(idx)):
            if isinstance(idx[i], gpuarray.GpuArray):
                idx[i] = np.asarray(idx[i])

        # convert boolean masks to index arrays
        idx = check_and_convert_boolean_masks(x, idx)

        # Insert axes for None indexing
        nidx = []
        nshp = list(x.shape)
        for k, i in enumerate(idx):
            if i is None:
                nidx.append(slice(None))
                nshp.insert(k, 1)
            else:
                nidx.append(i)

        x_ = x.reshape(nshp)

        # Bring array indices to front
        transp = []
        nidx_ = []
        p = 0
        for k, i in enumerate(list(nidx)):
            if isinstance(i, np.ndarray) and i.ndim != 0:
                transp.append(k)
                nidx_.append(i)
                p += 1
        for k, i in enumerate(list(nidx)):
            if not (isinstance(i, np.ndarray) and i.ndim != 0):
                transp.append(k)
                nidx_.append(i)
        transp = transp + list(range(len(transp), x_.ndim))
        # rtransp inverts transp, to restore the original axis order later.
        rtransp = [i for i, _ in sorted(enumerate(transp), key=lambda x:x[1])]
        nidx = nidx_

        # transp: order to shuffle axes of x so that single dimension
        #         subarrays are extracted first
        # p: number of axes with array indexing
        x_ = x_.transpose(*transp)
        idx_ = ([slice(None)] * p + nidx[p:])
        # flatten the array-indexed dimensions
        x_flat = x_.reshape((np.prod(x_.shape[0: p]),) + x_.shape[p:])
        # process y so that last axes are the same
        if y.shape != (1,):
            # Keep the longest trailing run of y's axes that broadcast
            # against x_flat's trailing axes; collapse the rest into one
            # leading axis.
            y_shape_reverse = []
            for x_s, y_s in zip(x_flat.shape[::-1], y.shape[::-1]):
                if x_s == y_s or y_s == 1:
                    y_shape_reverse.append(y_s)
                else:
                    break
            if np.prod(y_shape_reverse) < np.prod(y.shape):
                if len(y_shape_reverse) > 0:
                    y_shape_reverse.append(
                        int(np.prod(y.shape[0:-len(y_shape_reverse)])))
                else:
                    y_shape_reverse.append(int(np.prod(y.shape)))

            y_shape = y_shape_reverse[::-1]
            y_flat = y.reshape(y_shape)
        else:
            y_flat = y[0]

        # build the strides
        strides = [1]
        for i in range(p - 1, 0, -1):
            stride = x_.shape[i] * strides[0]
            strides.insert(0, stride)

        # build the indices and use it
        index = idx_[p:] + [slice(None)] * (len(x_flat.shape) - len(idx_[p:]) - 1)
        # take_idx: flat (row-major) position for each combination of the
        # p broadcast index arrays.
        take_idx = sum(i * s for i, s in zip(nidx, strides))
        if index == []:
            for j, i in enumerate(take_idx.flatten()):
                if y_flat.shape == ():
                    val = y_flat
                else:
                    val = y_flat[j]

                iadd(x_flat[i], val, broadcast=True)
        else:
            if (x_flat.shape[-len(y_flat.shape):] == y_flat.shape or
                    y_flat.shape == ()):
                # y_flat has to be broadcast over axes of x_flat[i]

                for i in take_idx.flatten():
                    if len(idx_[p:]) > 0:
                        x_flat_sub = x_flat[i].__getitem__(index)
                    else:
                        x_flat_sub = x_flat[i]
                    iadd(x_flat_sub, y_flat, broadcast=True)
            else:
                # y_flat's first axis corresponds to first axis of x_flat
                for j, i in enumerate(take_idx.flatten()):
                    if len(idx_[p:]) > 0:
                        x_flat_sub = x_flat[i].__getitem__(index)
                    else:
                        x_flat_sub = x_flat[i]
                    iadd(x_flat_sub, y_flat[j % y_flat.shape[0]], broadcast=True)
        # Undo the flatten and the axis shuffle before returning.
        x_ = x_flat.reshape(x_.shape).transpose(*rtransp)
        out[0] = x_
772
773
class GpuAdvancedIncSubtensor(HideC, BaseGpuAdvancedIncSubtensor, tensor.AdvancedIncSubtensor):
    """
    Implement AdvancedIncSubtensor on the gpu.

    """
    def make_node(self, x, y, *inputs):
        context = infer_context_name(x, y)
        # The CPU Op does dtype/broadcastable inference for the output.
        cpu_node = tensor.AdvancedIncSubtensor.make_node(self, x, y, *inputs)
        out_type = cpu_node.outputs[0].type
        gpu_out = GpuArrayType(dtype=out_type.dtype,
                               broadcastable=out_type.broadcastable,
                               context_name=context)
        gpu_x = as_gpuarray_variable(x, context)
        gpu_y = as_gpuarray_variable(y, context)
        return gof.Apply(self, [gpu_x, gpu_y] + cpu_node.inputs[2:],
                         [gpu_out()])
788
789
class GpuAdvancedBooleanIncSubtensor(HideC, BaseGpuAdvancedIncSubtensor, tensor.AdvancedBooleanIncSubtensor):
    """
    Implement AdvancedBooleanIncSubtensor on the gpu.

    """
    def make_node(self, x, y, *inputs):
        context = infer_context_name(x, y)
        # The CPU Op does dtype/broadcastable inference for the output.
        cpu_node = tensor.AdvancedBooleanIncSubtensor.make_node(self, x, y,
                                                                *inputs)
        out_type = cpu_node.outputs[0].type
        gpu_out = GpuArrayType(dtype=out_type.dtype,
                               broadcastable=out_type.broadcastable,
                               context_name=context)
        gpu_x = as_gpuarray_variable(x, context)
        gpu_y = as_gpuarray_variable(y, context)
        return gof.Apply(self, [gpu_x, gpu_y] + cpu_node.inputs[2:],
                         [gpu_out()])
804
805
class GpuAdvancedIncSubtensor1(Op):
    """
    Implement AdvancedIncSubtensor1 on the gpu.

    Computes ``x[ilist] += y`` row by row (or ``x[ilist] = y`` when
    ``set_instead_of_inc`` is True), where ``ilist`` is a vector of
    indices into the leading dimension of ``x``.

    """
    _f16_ok = True
    __props__ = ('inplace', 'set_instead_of_inc')
    params_type = ParamsType(inplace=bool_t,
                             set_instead_of_inc=bool_t,
                             context=gpu_context_type,
                             # following params are used into c_init_code_struct(),
                             # as inputs are not available in that function.
                             ndim_input_0=size_t,
                             ndim_input_1=size_t,
                             typecode_input_0=int_t,
                             typecode_input_1=int_t)

    def __init__(self, inplace=False, set_instead_of_inc=False):
        # inplace: reuse the storage of input 0 (x) for the output.
        self.inplace = inplace
        # set_instead_of_inc: assign y into the selected rows instead
        # of incrementing them.
        self.set_instead_of_inc = set_instead_of_inc
        if inplace:
            self.destroy_map = {0: [0]}

    def clone_inplace(self):
        # Return an equivalent op that destroys its first input.
        return self.__class__(
            inplace=True,
            set_instead_of_inc=self.set_instead_of_inc)

    def make_node(self, x, y, ilist):
        """
        Build the apply node for ``x[ilist] +=/= y``.

        `x` and `y` are moved to the GPU; the index vector `ilist`
        stays a host tensor variable.

        """
        ctx_name = infer_context_name(x, y)
        x_ = as_gpuarray_variable(x, ctx_name)
        y_ = as_gpuarray_variable(y, ctx_name)
        ilist_ = tensor.as_tensor_variable(ilist)

        assert x_.type.ndim >= y_.type.ndim

        if ilist_.type.dtype not in tensor.integer_dtypes:
            raise TypeError('index must be integers')
        if ilist_.type.ndim != 1:
            raise TypeError('index must be vector')
        if x_.type.ndim == 0:
            raise TypeError('cannot index into a scalar')
        if y_.type.ndim > x_.type.ndim:
            if self.set_instead_of_inc:
                opname = 'set'
            else:
                opname = 'increment'
            raise TypeError(
                'cannot %s x subtensor with ndim=%s by y with ndim=%s ' % (
                    opname, x_.type.ndim, y_.type.ndim))

        return gof.Apply(self, [x_, y_, ilist_], [x_.type()])

    def get_params(self, node):
        # The ndim/typecode of the inputs are stored into the params so
        # that c_init_code_struct() (which cannot see the node inputs)
        # can build the "a += b" GpuElemwise.
        return self.params_type.get_params(self, context=node.outputs[0].type.context,
                                           # following params are used into c_init_code_struct().
                                           ndim_input_0=node.inputs[0].ndim,
                                           ndim_input_1=node.inputs[1].ndim,
                                           typecode_input_0=node.inputs[0].type.typecode,
                                           typecode_input_1=node.inputs[1].type.typecode)

    # We can't use the parent version that loops on each index
    # as we also need to loop when set_instead_of_inc is True and the
    # parent doesn't loop in that case.
    def perform(self, node, inp, out_, params=None):
        """
        Python implementation: loop over the indices, setting or
        incrementing one row of `x` at a time with the cached
        GpuElemwise add kernel from get_iadd().

        """
        # TODO opt to make this inplace
        x, y, idx = inp
        out, = out_

        if not self.inplace:
            x = x.copy()

        out[0] = x

        if len(idx) == 0:
            return

        # Make sure idx is not a GpuArray otherwise we cannot use its
        # content to index x and y (This is because we serve as
        # fallback for _dev20).
        if isinstance(idx, gpuarray.GpuArray):
            idx = np.asarray(idx)

        # If `y` has as many dimensions as `x`, then we want to iterate
        # jointly on `x` and `y`. Otherwise, it means `y` should be
        # broadcasted to fill all relevant rows of `x`.
        if y.ndim == x.ndim and y.shape[0] != 1:
            assert len(y) == len(idx)
            if self.set_instead_of_inc:
                for (j, i) in enumerate(idx):
                    x[i] = y[j]
            else:
                k = get_iadd(node.inputs[0], node.inputs[1])
                for (j, i) in enumerate(idx):
                    k(x[i], y[j], broadcast=True)
        else:
            if y.ndim == x.ndim:
                # First dim is always 1 in this case.
                reshaped_y = y.reshape(y.shape[1:])
            else:
                nb_dims_to_add = (x.ndim - 1) - y.ndim
                reshaped_y = y.reshape((1,) * nb_dims_to_add + y.shape)

            if self.set_instead_of_inc:
                for i in idx:
                    x[i] = reshaped_y
            else:
                k = get_iadd(node.inputs[0], node.inputs[1])
                for i in idx:
                    k(x[i], reshaped_y, broadcast=True)

    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray/error.h>', '<gpuarray/array.h>',
                '<gpuarray/elemwise.h>', 'gpuarray_helper.h']

    def c_header_dirs(self):
        return [gpuarray_helper_inc_dir()]

    def c_support_code_struct(self, node, nodename):
        # One GpuElemwise "a += b" handle is kept per apply struct.
        return "\nGpuElemwise *iadd;\n"

    def c_init_code_struct(self, node, name, sub):
        # Build the "a += b" GpuElemwise once, at struct initialization,
        # from the typecodes/ndim stored in the params (see get_params).
        return """
        gpuelemwise_arg args[2] = {{0}};
        args[0].name = "a";
        args[0].typecode = %(params)s->typecode_input_0;
        args[0].flags = GE_READ|GE_WRITE;
        args[1].name = "b";
        args[1].typecode = %(params)s->typecode_input_1;
        args[1].flags = GE_READ;
        iadd = GpuElemwise_new(%(params)s->context->ctx, "", "a += b",
                               2, args, %(params)s->ndim_input_1, GE_CONVERT_F16);
        if (iadd == NULL) {
          PyErr_SetString(PyExc_RuntimeError, "Could not intialize inplace add support");
          %(fail)s
        }
        """ % dict(params=sub['params'], fail=sub['fail'])

    def c_code(self, node, name, inputs, outputs, sub):
        # C version of the same row-by-row loop as perform(); only
        # implemented when x and y have the same number of dimensions.
        if (node.inputs[0].ndim != node.inputs[1].ndim):
            raise NotImplementedError("This case does not have C code yet.")

        # NOTE: the custom `fail` snippet below frees the start/step
        # buffers before jumping to the regular failure code, so every
        # %(fail)s inside the body is leak-free.
        return """
        PyGpuArrayObject *row_x, *row_y;
        size_t nd = %(params)s->ndim_input_0;
        ssize_t *start = NULL, *step = NULL;
        size_t num_indices, j;
        int ret;
        int broadcast_y;

        start = (ssize_t*)malloc(nd * sizeof(ssize_t));
        step = (ssize_t*)malloc(nd * sizeof(ssize_t));
        if (start == NULL || step == NULL) {
            PyErr_NoMemory();
            %(fail)s
        }

        for (j = 0; j < nd; ++j) {
          start[j] = 0;
          step[j] = 1;
        }
        step[0] = 0;
        num_indices = PyArray_SIZE(%(ind)s);
        if (!%(params)s->inplace) {
          %(out)s = theano_try_copy(%(out)s, %(x)s);
          if (%(out)s == NULL) {
            // Exception already set
            %(fail)s
            }
        } else {
          Py_XDECREF(%(out)s);
          %(out)s = %(x)s;
          Py_INCREF(%(out)s);
        }
        if (num_indices != 0) {
          if ((num_indices - 1) > LONG_MAX) {
            PyErr_Format(PyExc_AssertionError,
                         "num_indices %%lld exceeds LONG_MAX + 1", (long long)num_indices);
            %(fail)s
          }
          broadcast_y = PyGpuArray_DIM(%(y)s, 0) == 1;
          for (j = 0; j < num_indices; j++) {
            start[0] = *(dtype_%(ind)s *)PyArray_GETPTR1(%(ind)s, j);
            if (start[0] < 0)
              start[0] += PyGpuArray_DIM(%(out)s, 0);
            if (start[0] < 0 || start[0] >= PyGpuArray_DIM(%(out)s, 0)) {
               PyErr_SetString(PyExc_IndexError, "index out of bounds");
               %(fail)s;
            }
            row_x = pygpu_index(%(out)s, start, (ssize_t *)PyGpuArray_DIMS(%(out)s), step);
            if (row_x == NULL)
              %(fail)s;

            if (broadcast_y)
              start[0] = 0;
            else
              start[0] = j;

            row_y = pygpu_index(%(y)s, start, (ssize_t *)PyGpuArray_DIMS(%(y)s), step);
            if (row_y == NULL) {
              Py_DECREF(row_x);
              %(fail)s;
            }

            if (%(params)s->set_instead_of_inc) {
              ret = GpuArray_setarray(&row_x->ga, &row_y->ga);
            } else {
              void *args[2];
              args[0] = (void *)&row_x->ga;
              args[1] = (void *)&row_y->ga;
              ret = GpuElemwise_call(iadd, args, GE_BROADCAST | GE_PADSHAPE);
            }
            Py_DECREF(row_x);
            Py_DECREF(row_y);
            if (ret != GA_NO_ERROR)
              PyErr_SetString(PyExc_RuntimeError, "Failed to set/inc elements");
          }
        }

        free(start);
        free(step);
        """ % dict(x=inputs[0], y=inputs[1], ind=inputs[2], out=outputs[0],
                   params=sub['params'],
                   fail="""
                   {
                        free(start);
                        free(step);
                        %(fail)s
                   }
                   """ % dict(fail=sub['fail']))

    def c_code_cache_version(self):
        return (5,)
1039
1040
class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
                                     GpuAdvancedIncSubtensor1):
    """
    Implement AdvancedIncSubtensor1 on the gpu with atomics

    A single GPU kernel performs all the set/increment operations,
    using atom_xchg/atom_add on the destination rows.

    """
    _f16_ok = True
    # Reuse the parent's params; only the computation strategy differs.
    params_type = GpuAdvancedIncSubtensor1.params_type
    get_params = GpuAdvancedIncSubtensor1.get_params

    def make_node(self, x, y, ilist):
        """
        It differs from GpuAdvancedIncSubtensor1 in that it makes sure
        the indexes are of type long.

        """
        ctx_name = infer_context_name(x, y, ilist)
        x_ = as_gpuarray_variable(x, ctx_name)
        # y is cast to x's dtype so the kernel handles a single type.
        y_ = as_gpuarray_variable(y.astype(x.dtype), ctx_name)
        # Unlike the parent class, the index vector is also placed on
        # the GPU: the kernel reads it from device memory.
        ilist_ = as_gpuarray_variable(ilist, ctx_name)

        assert x_.type.ndim >= y_.type.ndim

        if ilist_.type.dtype not in tensor.integer_dtypes:
            raise TypeError('index must be integers')
        if ilist_.type.ndim != 1:
            raise TypeError('index must be vector')
        if x_.type.ndim == 0:
            raise TypeError('cannot index into a scalar')
        if y_.type.ndim > x_.type.ndim:
            if self.set_instead_of_inc:
                opname = 'set'
            else:
                opname = 'increment'
            raise TypeError(
                'cannot %s x subtensor with ndim=%s by y with ndim=%s ' % (
                    opname, x_.type.ndim, y_.type.ndim))

        return gof.Apply(self, [x_, y_, ilist_], [x_.type()])

    def perform(self, node, inp, out, params):
        # Python fallback: delegate to the parent's loop-based version.
        return super(GpuAdvancedIncSubtensor1_dev20, self).perform(node, inp, out)

    def c_code_cache_version(self):
        return (14,)

    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray_helper.h>',
                '<gpuarray/types.h>']

    def c_header_dirs(self):
        return [gpuarray_helper_inc_dir()]

    def c_code(self, node, name, inputs, outputs, sub):
        # The kernel only handles 2d x with 2d y; other shapes raise so
        # the Python perform() (parent's loop) is used instead.
        if (node.inputs[0].ndim != node.inputs[1].ndim or
                node.inputs[0].ndim != 2):
            raise NotImplementedError("This case does not have C code yet.")

        return """
int err;
if (%(params)s->inplace) {
  Py_XDECREF(%(out)s);
  %(out)s = %(x)s;
  Py_INCREF(%(out)s);
} else {
  %(out)s = theano_try_copy(%(out)s, %(x)s);
}
if (!%(out)s) {
  // Exception already set
  %(fail)s
}
if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(params)s->set_instead_of_inc)) {
  %(fail)s
}
        """ % dict(x=inputs[0], y=inputs[1], ind=inputs[2], out=outputs[0], fail=sub['fail'], params=sub['params'])

    def gpu_kernels(self, node, nodename):
        """
        Return the k_vector_add_fast kernel specialized for this node's
        dtypes. Each GPU block handles one index at a time (striding by
        GDIM_0), and its threads stride over the columns of that row.

        """
        # We can't rely on numpy for this, it changes with the OS
        # (typecode letters select the atom_add_*/atom_xchg_* variant,
        # so only these dtypes are supported).
        CHARMAP = dict(int32='i', uint32='I',
                       int64='l', uint64='L',
                       float16='e', float32='f', float64='d')
        dtype_x = node.inputs[0].dtype
        dtype_y = node.inputs[1].dtype
        dtype_ind = node.inputs[2].dtype
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        type_y = gpuarray.dtype_to_ctype(dtype_y)
        type_ind = gpuarray.dtype_to_ctype(dtype_ind)
        flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
        kname = "k_vector_add_fast"
        k_var = "k_vector_add_fast_" + nodename
        code = """#include "cluda.h"
        KERNEL void k_vector_add_fast(const ga_size numRowsX,
                                      const ga_size numColsX,
                                      const ga_ssize stridesX0,
                                      const ga_ssize stridesX1,
                                      GLOBAL_MEM %(type_x)s *X,
                                      const ga_size offset_X,
                                      const ga_size numRowsY,
                                      const ga_size numColsY,
                                      const ga_ssize stridesY0,
                                      const ga_ssize stridesY1,
                                      GLOBAL_MEM %(type_y)s *Y,
                                      const ga_size offset_Y,
                                      const ga_size numIndices,
                                      const ga_ssize stridesIndices,
                                      GLOBAL_MEM %(type_ind)s *indices_arr,
                                      const ga_size offset_indices_arr,
                                      const ga_int set_instead_of_inc,
                                      GLOBAL_MEM ga_int *err)
        {
             X = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)X)+offset_X);
             Y = (GLOBAL_MEM %(type_y)s *)(((GLOBAL_MEM char *)Y)+offset_Y);
             indices_arr = (GLOBAL_MEM %(type_ind)s *)(((GLOBAL_MEM char *)indices_arr)+offset_indices_arr);

             for (ga_int i = GID_0; i < numIndices; i += GDIM_0)
             {
                  for (ga_int j = LID_0; j < numColsX; j += LDIM_0)
                  {
                      ga_ssize x_row = indices_arr[i * stridesIndices];
                      if (x_row < 0)
                          x_row += numRowsX;
                      ga_ssize y_row = i;
                      if (x_row < numRowsX && x_row >= 0) {
                        if (set_instead_of_inc) {
                          atom_xchg_%(tc)sg(&X[(x_row * stridesX0) + (j * stridesX1)],
                                   Y[(y_row * stridesY0) + (j * stridesY1)]);
                        } else {
                          atom_add_%(tc)sg(&X[(x_row * stridesX0) + (j * stridesX1)],
                                    Y[(y_row * stridesY0) + (j * stridesY1)]);
                        }
                      } else {
                        *err = 1;
                      }
                  }
             }
             return;
        }
        """ % dict(type_x=type_x, type_y=type_y, type_ind=type_ind,
                   tc=CHARMAP[dtype_x])
        from pygpu.gpuarray import SIZE, SSIZE
        params = [
            SIZE, SIZE, SSIZE, SSIZE, gpuarray.GpuArray, SIZE,
            SIZE, SIZE, SSIZE, SSIZE, gpuarray.GpuArray, SIZE,
            SIZE, SSIZE, gpuarray.GpuArray, SIZE, 'int32',
            gpuarray.GpuArray]
        return [Kernel(code=code, name=kname, params=params,
                       flags=flags, objvar=k_var)]

    def c_support_code_struct(self, node, nodename):
        # Host-side launcher: computes the launch geometry (up to 4096
        # blocks, up to 256 threads per block), converts byte strides to
        # element strides, launches the kernel and checks the per-context
        # error buffer for out-of-bounds indices.
        return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_struct(node, nodename) + """
        int GpuArray_vector_add_fast(PyGpuArrayObject* py_self,
                                     PyGpuArrayObject* py_other,
                                     PyGpuArrayObject* indices_arr,
                                     const int set_instead_of_inc)
        {
            size_t threads_per_block = std::min(PyGpuArray_DIMS(py_self)[1], (size_t)256);
            size_t n_blocks = std::min(PyGpuArray_SIZE(indices_arr), (size_t)4096);
            gpudata *errbuf;
            int err, kerr = 0;
            size_t itemsize_x = GpuArray_ITEMSIZE(&py_self->ga);
            size_t itemsize_y = GpuArray_ITEMSIZE(&py_other->ga);
            size_t itemsize_ind = GpuArray_ITEMSIZE(&indices_arr->ga);

            if (threads_per_block > 0 && n_blocks > 0) {
              err = gpudata_property(py_self->ga.data,
                                     GA_CTX_PROP_ERRBUF, &errbuf);
              if (err != GA_NO_ERROR) {
                PyErr_SetString(PyExc_RuntimeError, "Can't fetch error buffer");
                return 1;
              }

              err = k_vector_add_fast_call(
        1, &n_blocks, &threads_per_block, 0,
        PyGpuArray_DIMS(py_self)[0],
        PyGpuArray_DIMS(py_self)[1],
        PyGpuArray_STRIDES(py_self)[0] / itemsize_x,
        PyGpuArray_STRIDES(py_self)[1] / itemsize_x,
        py_self->ga.data,
        py_self->ga.offset,
        PyGpuArray_DIMS(py_other)[0],
        PyGpuArray_DIMS(py_other)[1],
        PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / itemsize_y,
        PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / itemsize_y,
        py_other->ga.data,
        py_other->ga.offset,
        PyGpuArray_DIMS(indices_arr)[0],
        PyGpuArray_STRIDES(indices_arr)[0] / itemsize_ind,
        indices_arr->ga.data,
        indices_arr->ga.offset,
        set_instead_of_inc,
        errbuf);

              if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "gpuarray error: %(k_var)s: %%s.",
                             GpuKernel_error(&%(k_var)s, err));
                return 1;
              }
              err = gpudata_read(&kerr, errbuf, 0, sizeof(int));
              if (err != GA_NO_ERROR) {
                PyErr_SetString(PyExc_RuntimeError, "Can't read error buffer");
                return 1;
              }
              if (kerr != 0) {
                PyErr_SetString(PyExc_IndexError, "Index out of bounds");
                kerr = 0;
                gpudata_write(errbuf, 0, &kerr, sizeof(int));
                return 1;
              }
            }
          return 0;
        }
        """ % dict(k_var="k_vector_add_fast_" + nodename)
1254
1255
class GpuExtractDiag(Op):
    """
    Extract the (offset-)diagonal of a GPU array along `axis1`/`axis2`,
    following the semantics of `numpy.diagonal`: the two axes are
    removed and the diagonal is appended as the last axis of the output.

    Parameters
    ----------
    offset : int
        Offset of the diagonal from the main one (positive: above,
        negative: below).
    axis1 : int
    axis2 : int
        The pair of axes defining the 2-D planes whose diagonals are
        extracted.
    view : bool
        If True, return a stride-tricked view on the input instead of
        a copy.

    """
    __props__ = ("offset", "axis1", "axis2", "view")
    _f16_ok = True

    def __init__(self, offset=0, axis1=0, axis2=1, view=False):
        self.view = view
        if self.view:
            self.view_map = {0: [0]}
        self.offset = offset
        self.axis1 = axis1
        self.axis2 = axis2

    def make_node(self, _x):
        ctx_name = infer_context_name(_x)
        x = as_gpuarray_variable(_x, ctx_name)

        if x.ndim < 2:
            raise ValueError('Diagonal needs an input with 2 or more '
                             'dimensions', x)
        # Output drops axis1/axis2 and appends a non-broadcastable
        # diagonal axis at the end.
        axis_small, axis_large = sorted((self.axis1, self.axis2))
        broadcastable = x.broadcastable[:axis_small] + \
            x.broadcastable[axis_small + 1:axis_large] + \
            x.broadcastable[axis_large + 1:] + (False,)
        return gof.Apply(self, [x], [x.type.clone(broadcastable=broadcastable)()])

    def _diag_size(self, dim1, dim2):
        # Length of the extracted diagonal, as in PyArray_Diagonal()
        # and infer_shape() below.
        if self.offset > 0:
            return int(np.clip(dim2 - self.offset, 0, dim1))
        elif self.offset < 0:
            return int(np.clip(dim1 + self.offset, 0, dim2))
        return int(np.minimum(dim1, dim2))

    def _empty_output(self, node, x_shape):
        # Build a correctly-shaped zero-sized output.
        out_shape = [d for i, d in enumerate(x_shape)
                     if i not in (self.axis1, self.axis2)]
        out_shape.append(self._diag_size(x_shape[self.axis1],
                                         x_shape[self.axis2]))
        return node.outputs[0].type.value_zeros(tuple(out_shape))

    def perform(self, node, inputs, outputs):
        (x,) = inputs
        (z,) = outputs
        # zero-sized matrices: nothing to slice.
        # Bug fix: the diagonal length must account for `offset` (it
        # previously used min(dim1, dim2), disagreeing with
        # infer_shape() and numpy.diagonal when offset != 0).
        if x.size == 0:
            z[0] = self._empty_output(node, x.shape)
            return

        # step 1) slicing on axis1 and axis2.
        if self.offset >= 0:
            stride_axis, slice_axis = self.axis1, self.axis2
        else:
            slice_axis, stride_axis = self.axis1, self.axis2

        small_axis, large_axis = sorted((x.shape[self.axis1],
                                         x.shape[self.axis2]))

        if x.shape[stride_axis] < x.shape[slice_axis]:
            # in the bigger triangle
            numstride = small_axis - np.max((
                0, small_axis + np.abs(self.offset) - large_axis))
        else:
            # in the smaller triangle
            numstride = small_axis - np.abs(self.offset)

        if numstride <= 0:
            # |offset| reaches past the matrix: the diagonal is empty.
            # (Previously this raised an IndexError in the slicing
            # below; numpy.diagonal returns an empty diagonal instead.)
            z[0] = self._empty_output(node, x.shape)
            return

        slicer = [np.s_[:], ] * x.ndim
        slicer[stride_axis] = np.s_[:numstride]
        slicer[slice_axis] = np.abs(self.offset)
        slicer = tuple(slicer)

        # step 2) Swap stride_axis to the last dim because we want the dim on
        # which the diags extracted be listed as the last dim of the tensor.
        # This is also in consistence with the interface of numpy.diagonal.
        if slice_axis < stride_axis:
            stride_axis -= 1
        new_dim_order = list(range(x[slicer].ndim))
        new_dim_order = tuple(new_dim_order[:stride_axis] +
                              new_dim_order[stride_axis + 1:] +
                              [stride_axis, ])
        rval = x[slicer].transpose(new_dim_order)

        # step 3) modify the strides in the last axis, such that rval becomes
        # a view on the diagonal.
        other_strides = tuple([d for i, d in enumerate(x.strides)
                               if i not in (self.axis1, self.axis2)])
        rval.strides = other_strides + \
            (x.strides[self.axis1] + x.strides[self.axis2], )

        if self.view:
            z[0] = rval
        else:
            z[0] = rval.copy()

    def grad(self, inputs, gout):
        # Gradient is not implemented for this op.
        (input_x,) = inputs
        return [grad_not_implemented(self, 0, input_x)]

    def infer_shape(self, node, shapes):
        in_shape, = shapes
        dim1 = in_shape[self.axis1]
        dim2 = in_shape[self.axis2]
        out_shape = [d for i, d in enumerate(in_shape)
                     if i not in (self.axis1, self.axis2)]
        # The following logic is inspired by C code of PyArray_Diagonal().
        offset = self.offset
        if offset > 0:
            diag_size = T.clip(dim2 - offset, 0, dim1)
        elif offset < 0:
            diag_size = T.clip(dim1 + offset, 0, dim2)
        else:
            diag_size = T.minimum(dim1, dim2)
        out_shape.append(diag_size)
        return [tuple(out_shape)]
1358
1359
class GpuAllocDiag(AllocDiag):
    """
    GPU version of AllocDiag: allocate an array whose (offset-)diagonal
    along `axis1`/`axis2` is filled with the input, zeros elsewhere.

    """
    __props__ = ("offset", "axis1", "axis2")

    def make_node(self, diag):
        """Lift `diag` to the GPU and create the (ndim+1)-d output."""
        ctx = infer_context_name(diag)
        diag = as_gpuarray_variable(diag, ctx)
        if diag.type.ndim < 1:
            raise ValueError('AllocDiag needs an input with 1 or more '
                             'dimensions', diag.type)
        out_type = diag.type.__class__(
            dtype=diag.dtype,
            broadcastable=[False] * (diag.ndim + 1))
        return gof.Apply(self, [diag], [out_type()])

    def perform(self, node, inputs, outputs):
        """Zero-fill a buffer, then copy the input into a stride-tricked
        view of its diagonals."""
        (x,) = inputs
        (z,) = outputs
        ax_lo = np.minimum(self.axis1, self.axis2)
        ax_hi = np.maximum(self.axis1, self.axis2)
        offset = self.offset
        shift = abs(offset)

        # Working buffer: all leading dims flattened into one, followed
        # by the two (square) output dims.
        side = x.shape[-1] + shift
        batch = np.prod(x.shape[:-1]).astype(np.int64)
        buf = gpuarray.zeros((batch, side, side),
                             dtype=x.dtype,
                             context=x.context)

        # View onto the target diagonals: pick the starting element in
        # the lower or upper triangle, then widen the stride by one
        # item so each step moves one row down and one column right.
        if offset < 0:
            view = buf[:, shift:, 0]
        else:
            view = buf[:, :x.shape[-1], shift]
        view.strides = (view.strides[0],
                        view.strides[1] + x.dtype.itemsize)

        # Fill every diagonal in one assignment.
        view[:] = x.reshape(view.shape)[:]

        # Restore the original leading dims.
        result = buf.reshape(x.shape[:-1] + (side, side))

        if len(x.shape) > 1:
            # Re-order axes so the two new dims land on axis1/axis2.
            lead = list(range(len(x.shape[:-1])))
            last = lead[-1]
            lead = lead[:ax_lo] + [last + 1] + lead[ax_lo:]
            lead = lead[:ax_hi] + [last + 2] + lead[ax_hi:]
            result = result.transpose(lead)

        z[0] = result

    def grad(self, inputs, gout):
        """Gradient: extract the same diagonal back from the output grad."""
        (gz,) = gout
        return [GpuExtractDiag(offset=self.offset,
                               axis1=self.axis1,
                               axis2=self.axis2)(gz)]
1418