1from __future__ import absolute_import, print_function, division 2 3import numpy as np 4from six import integer_types 5from six.moves import StringIO, xrange 6 7from theano import tensor, gof, Op 8from theano.gof import ParamsType 9from theano.gradient import grad_not_implemented 10import theano.tensor as T 11from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list 12from theano.tensor import AllocDiag 13from theano.scalar import bool as bool_t, int32 as int_t, uint32 as size_t 14 15try: 16 import pygpu 17 from pygpu import gpuarray 18except ImportError: 19 pass 20 21from .type import GpuArrayType, gpu_context_type 22from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel, gpuarray_helper_inc_dir, 23 infer_context_name, gpu_contiguous) 24 25iadd_reg = {} 26 27 28def get_iadd(a, b): 29 key = (a.type.dtype, b.type.dtype, a.type.context) 30 if key not in iadd_reg: 31 a_arg = pygpu.elemwise.arg('a', a.type.dtype, read=True, write=True) 32 b_arg = pygpu.elemwise.arg('b', b.type.dtype, read=True) 33 res = pygpu.elemwise.GpuElemwise(a.type.context, "a = a + b", [a_arg, b_arg], convert_f16=True) 34 iadd_reg[key] = res 35 return iadd_reg[key] 36 37 38class GpuSubtensor(HideC, Subtensor): 39 """ 40 Subtensor on the GPU. 41 """ 42 _f16_ok = True 43 44 def make_node(self, x, *inputs): 45 ctx_name = infer_context_name(x) 46 rval = tensor.Subtensor.make_node(self, x, *inputs) 47 otype = GpuArrayType(dtype=rval.outputs[0].type.dtype, 48 broadcastable=rval.outputs[0].type.broadcastable, 49 context_name=ctx_name) 50 x = as_gpuarray_variable(x, ctx_name) 51 return gof.Apply(self, [x] + rval.inputs[1:], [otype()]) 52 53 def perform(self, node, inputs, out_): 54 out, = out_ 55 x = inputs[0] 56 57 cdata = get_idx_list(inputs, self.idx_list) 58 if len(cdata) == 1: 59 cdata = cdata[0] 60 61 out[0] = x.__getitem__(cdata) 62 63 def c_support_code(self): 64 return """ 65 static int fix_indices(ssize_t *start, ssize_t *stop, ssize_t *step, 66 int start_n, int stop_n, int step_n, 67 size_t len) { 68 if (step_n) *step = 1; 69 if (*step == 0) { 70 PyErr_SetString(PyExc_ValueError, "slice step cannot be zero"); 71 return -1; 72 } 73 if (start_n) *start = (*step < 0) ? len-1 : 0; 74 else { 75 if (*start < 0) *start += len; 76 if (*start < 0) *start = (*step < 0) ? -1 : 0; 77 if (*start > -1 && *start >= len) { 78 *start = (*step < 0) ? len-1 : len; 79 } 80 } 81 82 if (stop_n) *stop = (*step < 0) ? -1 : len; 83 else { 84 if (*stop < 0) *stop += len; 85 if (*stop < 0) *stop = (*step < 0) ? -1 : 0; 86 if (*stop > -1 && *stop >= len) { 87 *stop = (*step < 0) ? len-1 : len; 88 } 89 } 90 if (*stop < *start && *step > 0) 91 *stop = *start; 92 return 0; 93 } 94 """ 95 96 def c_code(self, node, name, inputs, outputs, sub): 97 inp_ndim = node.inputs[0].ndim 98 inp = inputs[0] 99 indices = inputs[1:] 100 101 # pad out the index list to the same dimension as the input 102 idx_list = self.idx_list + \ 103 ((slice(None),) * (inp_ndim - len(self.idx_list))) 104 105 # This case fails when we use pygpu_index(), so here is some 106 # special code 107 if len(idx_list) == 0: 108 return """ 109 Py_XDECREF(%(out)s); 110 %(out)s = pygpu_copy(%(inp)s, GA_ANY_ORDER); 111 if (!%(out)s) { 112 // Exception already set 113 %(fail)s 114 } 115""" % dict(out=outputs[0], inp=inp, fail=sub['fail']) 116 117 sio = StringIO() 118 print(""" 119 ssize_t starts[%(sz)s]; 120 ssize_t stops[%(sz)s]; 121 ssize_t steps[%(sz)s]; 122 ssize_t cur; 123 int err; 124 125 if (%(inp)s->ga.nd != %(sz)s) { 126 PyErr_SetString(PyExc_IndexError, "invalid index"); 127 %(fail)s 128 } 129 """ % dict(sz=len(idx_list), inp=inp, fail=sub['fail']), file=sio) 130 131 def fix_idx(idx): 132 if idx is None: 133 return "0", 1 134 elif isinstance(idx, (np.integer, integer_types)): 135 return str(idx), 0 136 elif isinstance(idx, gof.Type): 137 return indices.pop(0), 0 138 else: 139 assert 0, idx 140 141 for i, idx in enumerate(idx_list): 142 if isinstance(idx, slice): 143 start, start_n = fix_idx(idx.start) 144 stop, stop_n = fix_idx(idx.stop) 145 step, step_n = fix_idx(idx.step) 146 print(""" 147 starts[%(i)s] = %(start)s; 148 stops[%(i)s] = %(stop)s; 149 steps[%(i)s] = %(step)s; 150 if (fix_indices(&starts[%(i)s], &stops[%(i)s], &steps[%(i)s], 151 %(start_n)s, %(stop_n)s, %(step_n)s, 152 %(inp)s->ga.dimensions[%(i)s]) == -1) { 153 %(fail)s 154 } 155 """ % dict(i=i, start=start, stop=stop, step=step, 156 start_n=start_n, stop_n=stop_n, step_n=step_n, 157 fail=sub['fail'], inp=inp), file=sio) 158 else: 159 if isinstance(idx, gof.Type): 160 start = indices.pop(0) 161 elif isinstance(idx, (np.integer, integer_types)): 162 start = idx 163 else: 164 assert 0, idx 165 print(""" 166 cur = %(start)s; 167 if (cur < 0) 168 cur += %(inp)s->ga.dimensions[%(i)s]; 169 starts[%(i)s] = cur; 170 steps[%(i)s] = 0; 171 """ % dict(i=i, start=start, fail=sub['fail'], inp=inp), file=sio) 172 173 print(""" 174 Py_XDECREF(%(out)s); 175 %(out)s = pygpu_index(%(inp)s, starts, stops, steps); 176 if (!%(out)s) { %(fail)s } 177""" % dict(name=name, fail=sub['fail'], inp=inp, out=outputs[0]), file=sio) 178 179 return sio.getvalue() 180 181 def c_code_cache_version(self): 182 return (8,) 183 184 185class GpuIncSubtensor(IncSubtensor): 186 """ 187 Implement IncSubtensor on the gpu. 188 189 Notes 190 ----- 191 The optimization to make this inplace is in tensor/opt. 192 The same optimization handles IncSubtensor and GpuIncSubtensor. 193 This Op has c_code too; it inherits tensor.IncSubtensor's c_code. 194 The helper methods like :meth:`do_type_checking`, 195 :meth:`copy_of_x`, etc. specialize the c_code for this Op. 196 197 """ 198 _f16_ok = True 199 params_type = gpu_context_type 200 201 def make_node(self, x, y, *inputs): 202 ctx_name = infer_context_name(x, y) 203 x = as_gpuarray_variable(x, ctx_name) 204 y = as_gpuarray_variable(y, ctx_name) 205 rval = tensor.IncSubtensor.make_node(self, x, y, *inputs) 206 ret = gof.Apply(self, [x, y] + rval.inputs[2:], [x.type()]) 207 return ret 208 209 def get_params(self, node): 210 return node.outputs[0].type.context 211 212 def perform(self, node, inputs, out_, ctx): 213 out, = out_ 214 x, y = inputs[:2] 215 indices = list(reversed(inputs[2:])) 216 217 def convert(entry): 218 if isinstance(entry, gof.Type): 219 rval = indices.pop() 220 return rval 221 elif isinstance(entry, slice): 222 return slice(convert(entry.start), 223 convert(entry.stop), 224 convert(entry.step)) 225 else: 226 return entry 227 228 cdata = tuple(map(convert, self.idx_list)) 229 if len(cdata) == 1: 230 cdata = cdata[0] 231 if not self.inplace: 232 x = x.copy() 233 sub_x = x.__getitem__(cdata) 234 if sub_x.shape: 235 # we've sliced out an N-D tensor with N > 0 236 if not self.set_instead_of_inc: 237 # sub_x += y 238 iadd = get_iadd(node.inputs[0], node.inputs[1]) 239 iadd(sub_x, y) 240 else: 241 # sub_x[...] = y 242 x.__setitem__(cdata, y) 243 else: 244 # scalar case 245 if not self.set_instead_of_inc: 246 # x.__setitem__(cdata, sub_x + y) 247 tmp = pygpu.elemwise.elemwise2(sub_x, '+', y, sub_x, 248 broadcast=False) 249 x.__setitem__(cdata, tmp) 250 else: 251 x.__setitem__(cdata, y) 252 out[0] = x 253 254 def do_type_checking(self, node): 255 """ 256 Should raise NotImplementedError if c_code does not support 257 the types involved in this node. 258 259 """ 260 261 if not isinstance(node.inputs[0].type, GpuArrayType): 262 raise NotImplementedError() 263 264 def copy_of_x(self, x): 265 """ 266 267 Parameters 268 ---------- 269 x 270 A string giving the name of a C variable pointing to an array. 271 272 Returns 273 ------- 274 str 275 C code expression to make a copy of x. 276 277 Notes 278 ----- 279 Base class uses `PyArrayObject *`, subclasses may override for 280 different types of arrays. 281 282 """ 283 return """pygpu_copy(%(x)s, GA_ANY_ORDER)""" % locals() 284 285 def decl_view(self): 286 return "PyGpuArrayObject* zview = NULL;" 287 288 def make_view_array(self, x, view_ndim): 289 """ 290 //TODO 291 292 Parameters 293 ---------- 294 x 295 A string identifying an array to be viewed. 296 view_ndim 297 A string specifying the number of dimensions to have in the view. 298 This doesn't need to actually set up the view with the 299 right indexing; we'll do that manually later. 300 301 """ 302 ret = """ 303 size_t dims[%(view_ndim)s]; 304 for(int i=0; i<%(view_ndim)s; i++) 305 dims[i] = xview_dims[i]; 306 307 zview = pygpu_fromgpudata(%(x)s->ga.data, 308 %(x)s->ga.offset + xview_offset, 309 %(x)s->ga.typecode, 310 %(view_ndim)s, 311 dims, 312 xview_strides, 313 %(x)s->context, 314 1, 315 (PyObject *)%(x)s, 316 (PyObject *)&PyGpuArrayType); 317 """ % locals() 318 return ret 319 320 def get_helper_c_code_args(self): 321 """ 322 Return a dictionary of arguments to use with helper_c_code. 323 324 """ 325 return {'c_prefix': 'PyGpuArray', 326 'strides_mul': 1 327 } 328 329 def copy_into(self, view, source): 330 """ 331 332 Parameters 333 ---------- 334 view : string 335 C code expression for an array. 336 source : string 337 C code expression for an array. 338 339 Returns 340 ------- 341 str 342 C code expression to copy source into view, and 0 on success. 343 344 """ 345 return """sub_setarray(&%(view)s->ga, &%(source)s->ga)""" % locals() 346 347 def c_headers(self): 348 return ['<numpy_compat.h>', '<gpuarray/error.h>', '<gpuarray/array.h>', 349 '<gpuarray/elemwise.h>'] 350 351 def c_support_code(self): 352 return """ 353int sub_setarray(GpuArray *dst, GpuArray *src) { 354 int err; 355 err = GpuArray_setarray(dst, src); 356 if (err != GA_NO_ERROR) 357 PyErr_SetString(PyExc_RuntimeError, GpuArray_error(src, err)); 358 return err; 359} 360""" 361 362 def c_support_code_struct(self, node, nodename): 363 return "\nGpuElemwise *iadd;\n" 364 365 def c_init_code_struct(self, node, name, sub): 366 return """ 367 gpuelemwise_arg args[2] = {{0}}; 368 args[0].name = "a"; 369 args[0].typecode = %(type1)s; 370 args[0].flags = GE_READ|GE_WRITE; 371 args[1].name = "b"; 372 args[1].typecode = %(type2)s; 373 args[1].flags = GE_READ; 374 iadd = GpuElemwise_new(%(ctx)s->ctx, "", "a += b", 375 2, args, %(nd)s, GE_CONVERT_F16); 376 if (iadd == NULL) { 377 PyErr_SetString(PyExc_RuntimeError, "Could not intialize inplace add support"); 378 %(fail)s 379 } 380 """ % dict(ctx=sub['params'], fail=sub['fail'], 381 type1=node.inputs[0].type.typecode, 382 type2=node.inputs[1].type.typecode, 383 nd=node.inputs[1].ndim) 384 385 def add_to_zview(self, nodename, x, fail): 386 return """ 387 { 388 void *args[2]; 389 args[0] = &zview->ga; 390 args[1] = &%(x)s->ga; 391 if (GpuElemwise_call(iadd, args, GE_BROADCAST | GE_PADSHAPE) != GA_NO_ERROR) { 392 PyErr_SetString(PyExc_RuntimeError, "Error doing inplace add"); 393 Py_DECREF(zview); 394 %(fail)s 395 } 396 } 397 """ % locals() 398 399 def c_code_cache_version(self): 400 parent_version = super(GpuIncSubtensor, self).c_code_cache_version() 401 if not parent_version: 402 return 403 return parent_version + (10,) 404 405 406class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1): 407 """ 408 AdvancedSubrensor1 on the GPU. 409 """ 410 _f16_ok = True 411 412 def make_node(self, x, ilist): 413 ctx_name = infer_context_name(x, ilist) 414 x_ = as_gpuarray_variable(x, ctx_name) 415 416 ilist__ = tensor.as_tensor_variable(ilist) 417 if ilist__.type.dtype not in tensor.integer_dtypes: 418 raise TypeError('index must be integers') 419 if ilist__.type.dtype != 'int64': 420 ilist__ = tensor.cast(ilist__, 'int64') 421 422 ilist_ = gpu_contiguous(as_gpuarray_variable(ilist__, ctx_name)) 423 424 if ilist_.type.dtype != 'int64': 425 raise TypeError('index must be int64') 426 if ilist_.type.ndim != 1: 427 raise TypeError('index must be a vector') 428 if x_.type.ndim == 0: 429 raise TypeError('cannot index into a scalar') 430 431 bcast = ilist_.broadcastable + x_.broadcastable[1:] 432 return gof.Apply(self, [x_, ilist_], 433 [GpuArrayType(dtype=x.dtype, 434 context_name=ctx_name, 435 broadcastable=bcast)()]) 436 437 def perform(self, node, inp, out_): 438 raise NotImplementedError() 439 440 def c_support_code(self): 441 return """ 442int take1_match_dims(GpuArray *a, GpuArray *v) { 443 if (a->nd != v->nd) return 0; 444 for (unsigned int i = 1; i < v->nd; i++) { 445 if (a->dimensions[i] != v->dimensions[i]) return 0; 446 } 447 return 1; 448} 449""" 450 451 def c_code(self, node, name, inputs, outputs, sub): 452 return """ 453int err; 454if (%(out)s == NULL || !GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga) || 455 %(out)s->ga.dimensions[0] != %(idx)s->ga.dimensions[0] || 456 !take1_match_dims(&%(out)s->ga, &%(v)s->ga)) { 457 size_t tmp; 458 Py_XDECREF(%(out)s); 459 460 /* This is a dirty hack to avoid an extra alloc */ 461 tmp = %(v)s->ga.dimensions[0]; 462 %(v)s->ga.dimensions[0] = %(idx)s->ga.dimensions[0]; 463 %(out)s = pygpu_empty(%(v)s->ga.nd, %(v)s->ga.dimensions, %(v)s->ga.typecode, 464 GA_C_ORDER, %(v)s->context, Py_None); 465 if (%(out)s == NULL) { 466 %(fail)s; 467 } 468 %(v)s->ga.dimensions[0] = tmp; // Don't remove this line 469} 470 471err = GpuArray_take1(&%(out)s->ga, &%(v)s->ga, &%(idx)s->ga, 1); 472if (err != GA_NO_ERROR) { 473 if (err == GA_VALUE_ERROR) { 474 PyErr_SetString(PyExc_IndexError, "Index out of bounds."); 475 } else { 476 PyErr_SetString(PyExc_RuntimeError, GpuArray_error(&%(v)s->ga, err)); 477 } 478 %(fail)s 479} 480""" % dict(out=outputs[0], v=inputs[0], idx=inputs[1], fail=sub['fail']) 481 482 def c_code_cache_version(self): 483 return (1,) 484 485 486def check_and_convert_boolean_masks(input, idx_list): 487 """ 488 This function checks if the boolean mask arrays in the index have 489 the right shape and converts them to index arrays by calling nonzero. 490 For each boolean mask, we check if the mask has the 491 same shape as the input. This is enforced in NumPy 0.13.0 and 492 newer, but not by earlier versions. If the size is not the same, 493 this method raises an IndexError. 494 """ 495 dim_seen = 0 496 out_idx_list = [] 497 for index in idx_list: 498 if index is np.newaxis: 499 # skip, does not count as an input dimension 500 out_idx_list.append(index) 501 elif isinstance(index, np.ndarray) and index.dtype == 'bool': 502 for i in xrange(index.ndim): 503 if index.shape[i] != input.shape[dim_seen + i]: 504 raise IndexError('boolean index did not match indexed array ' 505 'along dimension %d; dimension is %d but ' 506 'corresponding boolean dimension is %d' % 507 (dim_seen + i, input.shape[dim_seen + i], 508 index.shape[i])) 509 dim_seen += index.ndim 510 out_idx_list += index.nonzero() 511 else: 512 dim_seen += 1 513 out_idx_list.append(index) 514 return out_idx_list 515 516 517class BaseGpuAdvancedSubtensor(object): 518 def perform(self, node, inputs, out_): 519 out, = out_ 520 x = inputs[0] 521 idx = inputs[1:] 522 523 # convert boolean masks to index arrays 524 idx = check_and_convert_boolean_masks(x, idx) 525 526 # detect and transpose array indices 527 nidx = [] 528 nshp = list(x.shape) 529 for k, i in enumerate(idx): 530 if i is None: 531 nidx.append(slice(None)) 532 nshp.insert(k, 1) 533 else: 534 nidx.append(i) 535 536 x = x.reshape(nshp) 537 538 transp = list(range(x.ndim)) 539 # number of array-indexed dimensions 540 p = 0 541 # ap represents the axis in the resulting array where the 542 # dimensions indexed by arrays and ints will be inserted. 543 # For instance, if all such dimensions are grouped together, 544 # it corresponds to the index of the first such dimension in the 545 # initial array. If these dimensions are split (with slices 546 # between), then the resulting dimensions will be moved to the 547 # beginning, and ap will be 0. 548 # If no such dimension has been encountered, ap is None. 549 ap = None 550 # Indicates whether we have already encountered an index (array 551 # or number), and then a slice. 552 slice_after_idx = False 553 for k, i in enumerate(list(nidx)): 554 if (isinstance(i, np.ndarray) and i.ndim != 0): 555 transp.remove(k) 556 transp.insert(p, k) 557 i = nidx.pop(k) 558 nidx.insert(p, i) 559 p += 1 560 if ap is None: 561 # first non-slice index 562 ap = k 563 elif slice_after_idx: 564 # We already encountered at least an array or int, and then 565 # a slice. Array-indexed axes are not grouped, 566 # moving to the beginning 567 ap = 0 568 else: 569 try: 570 i.__index__() 571 if ap is None: 572 ap = k 573 # indices do not break the contiguity of 574 # array-indexed axes 575 except Exception: 576 # If we already encountered an array/int index, it 577 # means future ones will not be grouped. 578 if ap is not None: 579 slice_after_idx = True 580 581 x = x.transpose(*transp) 582 583 idx_ = ([slice(None)] * p + nidx[p:]) 584 x = x.__getitem__(idx_) 585 586 if p == 0: 587 assert ap is None 588 # The only indexing was through slices and indices. 589 # This can happen with symbolic slices for instance. 590 # Since no view_map is set, we need to copy the returned value 591 out[0] = x.copy() 592 return 593 594 # At this point, we should have encountered at least one array 595 assert ap is not None 596 597 # flatten the array-indexed dimensions 598 shape = ((np.prod(x.shape[0: p]),) + 599 x.shape[p:]) 600 input_flat = x.reshape(shape) 601 602 # build the strides 603 strides = [1] 604 for i in range(p - 1, 0, -1): 605 stride = x.shape[i] * strides[0] 606 strides.insert(0, stride) 607 608 # build the indices and use it 609 take_idx = sum((i * s for i, s in zip(nidx, strides))) 610 out_flat = input_flat.take1(pygpu.asarray(take_idx.flatten(), 611 context=x.context)) 612 613 # finish up 614 out_flat_shp = take_idx.shape + x.shape[p:] 615 o = out_flat.reshape(out_flat_shp) 616 617 if ap != 0: 618 # Put the resulting indexing at the place that NumPy 619 # decided was the right one. 620 ntransp = list(range(take_idx.ndim, o.ndim)) 621 ntransp[ap:ap] = list(range(take_idx.ndim)) 622 o = o.transpose(*ntransp) 623 624 out[0] = o 625 626 627class GpuAdvancedSubtensor(HideC, BaseGpuAdvancedSubtensor, tensor.AdvancedSubtensor): 628 """ 629 AdvancedSubtensor on the GPU. 630 """ 631 def make_node(self, x, *inputs): 632 ctx_name = infer_context_name(x) 633 # This method relies on AdvancedSubtensor.make_node to 634 # call tensor.subtensor.check_and_reject_bool(inputs), 635 # which raises an IndexError if there are any boolean indices. 636 rval = tensor.AdvancedSubtensor.make_node(self, x, *inputs) 637 otype = GpuArrayType(dtype=rval.outputs[0].type.dtype, 638 broadcastable=rval.outputs[0].type.broadcastable, 639 context_name=ctx_name) 640 x = as_gpuarray_variable(x, ctx_name) 641 return gof.Apply(self, [x] + rval.inputs[1:], [otype()]) 642 643 644class GpuAdvancedBooleanSubtensor(HideC, BaseGpuAdvancedSubtensor, tensor.AdvancedBooleanSubtensor): 645 """ 646 AdvancedBooleanSubtensor on the GPU. 647 """ 648 def make_node(self, x, *inputs): 649 ctx_name = infer_context_name(x) 650 rval = tensor.AdvancedBooleanSubtensor.make_node(self, x, *inputs) 651 otype = GpuArrayType(dtype=rval.outputs[0].type.dtype, 652 broadcastable=rval.outputs[0].type.broadcastable, 653 context_name=ctx_name) 654 x = as_gpuarray_variable(x, ctx_name) 655 return gof.Apply(self, [x] + rval.inputs[1:], [otype()]) 656 657 658class BaseGpuAdvancedIncSubtensor(object): 659 def perform(self, node, inp, out_): 660 out, = out_ 661 x = inp[0] 662 y = inp[1] 663 idx = inp[2:] 664 x = x.copy() 665 # Get a handle to the GpuElemwise object that will be called. 666 # It is not necessary to have the right number of dimensions, 667 # so we just pass symbolic x and y. 668 iadd = get_iadd(node.inputs[0], node.inputs[1]) 669 670 # convert all indices to np.array 671 for i in range(len(idx)): 672 if isinstance(idx[i], gpuarray.GpuArray): 673 idx[i] = np.asarray(idx[i]) 674 675 # convert boolean masks to index arrays 676 idx = check_and_convert_boolean_masks(x, idx) 677 678 # Insert axes for None indexing 679 nidx = [] 680 nshp = list(x.shape) 681 for k, i in enumerate(idx): 682 if i is None: 683 nidx.append(slice(None)) 684 nshp.insert(k, 1) 685 else: 686 nidx.append(i) 687 688 x_ = x.reshape(nshp) 689 690 # Bring array indices to front 691 transp = [] 692 nidx_ = [] 693 p = 0 694 for k, i in enumerate(list(nidx)): 695 if isinstance(i, np.ndarray) and i.ndim != 0: 696 transp.append(k) 697 nidx_.append(i) 698 p += 1 699 for k, i in enumerate(list(nidx)): 700 if not (isinstance(i, np.ndarray) and i.ndim != 0): 701 transp.append(k) 702 nidx_.append(i) 703 transp = transp + list(range(len(transp), x_.ndim)) 704 rtransp = [i for i, _ in sorted(enumerate(transp), key=lambda x:x[1])] 705 nidx = nidx_ 706 707 # transp: order to shuffle axes of x so that single dimension 708 # subarrays are extracted first 709 # p: number of axes with array indexing 710 x_ = x_.transpose(*transp) 711 idx_ = ([slice(None)] * p + nidx[p:]) 712 # flatten the array-indexed dimensions 713 x_flat = x_.reshape((np.prod(x_.shape[0: p]),) + x_.shape[p:]) 714 # process y so that last axes are the same 715 if y.shape != (1,): 716 y_shape_reverse = [] 717 for x_s, y_s in zip(x_flat.shape[::-1], y.shape[::-1]): 718 if x_s == y_s or y_s == 1: 719 y_shape_reverse.append(y_s) 720 else: 721 break 722 if np.prod(y_shape_reverse) < np.prod(y.shape): 723 if len(y_shape_reverse) > 0: 724 y_shape_reverse.append( 725 int(np.prod(y.shape[0:-len(y_shape_reverse)]))) 726 else: 727 y_shape_reverse.append(int(np.prod(y.shape))) 728 729 y_shape = y_shape_reverse[::-1] 730 y_flat = y.reshape(y_shape) 731 else: 732 y_flat = y[0] 733 734 # build the strides 735 strides = [1] 736 for i in range(p - 1, 0, -1): 737 stride = x_.shape[i] * strides[0] 738 strides.insert(0, stride) 739 740 # build the indices and use it 741 index = idx_[p:] + [slice(None)] * (len(x_flat.shape) - len(idx_[p:]) - 1) 742 take_idx = sum(i * s for i, s in zip(nidx, strides)) 743 if index == []: 744 for j, i in enumerate(take_idx.flatten()): 745 if y_flat.shape == (): 746 val = y_flat 747 else: 748 val = y_flat[j] 749 750 iadd(x_flat[i], val, broadcast=True) 751 else: 752 if (x_flat.shape[-len(y_flat.shape):] == y_flat.shape or 753 y_flat.shape == ()): 754 # y_flat has to be broadcast over axes of x_flat[i] 755 756 for i in take_idx.flatten(): 757 if len(idx_[p:]) > 0: 758 x_flat_sub = x_flat[i].__getitem__(index) 759 else: 760 x_flat_sub = x_flat[i] 761 iadd(x_flat_sub, y_flat, broadcast=True) 762 else: 763 # y_flat's first axis corresponds to first exist of x_flat 764 for j, i in enumerate(take_idx.flatten()): 765 if len(idx_[p:]) > 0: 766 x_flat_sub = x_flat[i].__getitem__(index) 767 else: 768 x_flat_sub = x_flat[i] 769 iadd(x_flat_sub, y_flat[j % y_flat.shape[0]], broadcast=True) 770 x_ = x_flat.reshape(x_.shape).transpose(*rtransp) 771 out[0] = x_ 772 773 774class GpuAdvancedIncSubtensor(HideC, BaseGpuAdvancedIncSubtensor, tensor.AdvancedIncSubtensor): 775 """ 776 Implement AdvancedIncSubtensor on the gpu. 777 778 """ 779 def make_node(self, x, y, *inputs): 780 ctx_name = infer_context_name(x, y) 781 rval = tensor.AdvancedIncSubtensor.make_node(self, x, y, *inputs) 782 otype = GpuArrayType(dtype=rval.outputs[0].type.dtype, 783 broadcastable=rval.outputs[0].type.broadcastable, 784 context_name=ctx_name) 785 x = as_gpuarray_variable(x, ctx_name) 786 y = as_gpuarray_variable(y, ctx_name) 787 return gof.Apply(self, [x, y] + rval.inputs[2:], [otype()]) 788 789 790class GpuAdvancedBooleanIncSubtensor(HideC, BaseGpuAdvancedIncSubtensor, tensor.AdvancedBooleanIncSubtensor): 791 """ 792 Implement AdvancedBooleanIncSubtensor on the gpu. 793 794 """ 795 def make_node(self, x, y, *inputs): 796 ctx_name = infer_context_name(x, y) 797 rval = tensor.AdvancedBooleanIncSubtensor.make_node(self, x, y, *inputs) 798 otype = GpuArrayType(dtype=rval.outputs[0].type.dtype, 799 broadcastable=rval.outputs[0].type.broadcastable, 800 context_name=ctx_name) 801 x = as_gpuarray_variable(x, ctx_name) 802 y = as_gpuarray_variable(y, ctx_name) 803 return gof.Apply(self, [x, y] + rval.inputs[2:], [otype()]) 804 805 806class GpuAdvancedIncSubtensor1(Op): 807 """ 808 Implement AdvancedIncSubtensor1 on the gpu. 809 810 """ 811 _f16_ok = True 812 __props__ = ('inplace', 'set_instead_of_inc') 813 params_type = ParamsType(inplace=bool_t, 814 set_instead_of_inc=bool_t, 815 context=gpu_context_type, 816 # following params are used into c_init_code_struct(), 817 # as inputs are not available in that function. 818 ndim_input_0=size_t, 819 ndim_input_1=size_t, 820 typecode_input_0=int_t, 821 typecode_input_1=int_t) 822 823 def __init__(self, inplace=False, set_instead_of_inc=False): 824 self.inplace = inplace 825 self.set_instead_of_inc = set_instead_of_inc 826 if inplace: 827 self.destroy_map = {0: [0]} 828 829 def clone_inplace(self): 830 return self.__class__( 831 inplace=True, 832 set_instead_of_inc=self.set_instead_of_inc) 833 834 def make_node(self, x, y, ilist): 835 ctx_name = infer_context_name(x, y) 836 x_ = as_gpuarray_variable(x, ctx_name) 837 y_ = as_gpuarray_variable(y, ctx_name) 838 ilist_ = tensor.as_tensor_variable(ilist) 839 840 assert x_.type.ndim >= y_.type.ndim 841 842 if ilist_.type.dtype not in tensor.integer_dtypes: 843 raise TypeError('index must be integers') 844 if ilist_.type.ndim != 1: 845 raise TypeError('index must be vector') 846 if x_.type.ndim == 0: 847 raise TypeError('cannot index into a scalar') 848 if y_.type.ndim > x_.type.ndim: 849 if self.set_instead_of_inc: 850 opname = 'set' 851 else: 852 opname = 'increment' 853 raise TypeError( 854 'cannot %s x subtensor with ndim=%s by y with ndim=%s ' % ( 855 opname, x_.type.ndim, y_.type.ndim)) 856 857 return gof.Apply(self, [x_, y_, ilist_], [x_.type()]) 858 859 def get_params(self, node): 860 return self.params_type.get_params(self, context=node.outputs[0].type.context, 861 # following params are used into c_init_code_struct(). 862 ndim_input_0=node.inputs[0].ndim, 863 ndim_input_1=node.inputs[1].ndim, 864 typecode_input_0=node.inputs[0].type.typecode, 865 typecode_input_1=node.inputs[1].type.typecode) 866 867 # We can't use the parent version that loops on each index 868 # as we also need to loop when set_instead_of_inc is True and the 869 # parent doesn't loop in that case. 870 def perform(self, node, inp, out_, params=None): 871 # TODO opt to make this inplace 872 x, y, idx = inp 873 out, = out_ 874 875 if not self.inplace: 876 x = x.copy() 877 878 out[0] = x 879 880 if len(idx) == 0: 881 return 882 883 # Make sure idx is not a GpuArray otherwise we cannot use its 884 # content to index x and y (This is because we serve as 885 # fallback for _dev20). 886 if isinstance(idx, gpuarray.GpuArray): 887 idx = np.asarray(idx) 888 889 # If `y` has as many dimensions as `x`, then we want to iterate 890 # jointly on `x` and `y`. Otherwise, it means `y` should be 891 # broadcasted to fill all relevant rows of `x`. 892 if y.ndim == x.ndim and y.shape[0] != 1: 893 assert len(y) == len(idx) 894 if self.set_instead_of_inc: 895 for (j, i) in enumerate(idx): 896 x[i] = y[j] 897 else: 898 k = get_iadd(node.inputs[0], node.inputs[1]) 899 for (j, i) in enumerate(idx): 900 k(x[i], y[j], broadcast=True) 901 else: 902 if y.ndim == x.ndim: 903 # First dim is always 1 in this case. 904 reshaped_y = y.reshape(y.shape[1:]) 905 else: 906 nb_dims_to_add = (x.ndim - 1) - y.ndim 907 reshaped_y = y.reshape((1,) * nb_dims_to_add + y.shape) 908 909 if self.set_instead_of_inc: 910 for i in idx: 911 x[i] = reshaped_y 912 else: 913 k = get_iadd(node.inputs[0], node.inputs[1]) 914 for i in idx: 915 k(x[i], reshaped_y, broadcast=True) 916 917 def c_headers(self): 918 return ['<numpy_compat.h>', '<gpuarray/error.h>', '<gpuarray/array.h>', 919 '<gpuarray/elemwise.h>', 'gpuarray_helper.h'] 920 921 def c_header_dirs(self): 922 return [gpuarray_helper_inc_dir()] 923 924 def c_support_code_struct(self, node, nodename): 925 return "\nGpuElemwise *iadd;\n" 926 927 def c_init_code_struct(self, node, name, sub): 928 return """ 929 gpuelemwise_arg args[2] = {{0}}; 930 args[0].name = "a"; 931 args[0].typecode = %(params)s->typecode_input_0; 932 args[0].flags = GE_READ|GE_WRITE; 933 args[1].name = "b"; 934 args[1].typecode = %(params)s->typecode_input_1; 935 args[1].flags = GE_READ; 936 iadd = GpuElemwise_new(%(params)s->context->ctx, "", "a += b", 937 2, args, %(params)s->ndim_input_1, GE_CONVERT_F16); 938 if (iadd == NULL) { 939 PyErr_SetString(PyExc_RuntimeError, "Could not intialize inplace add support"); 940 %(fail)s 941 } 942 """ % dict(params=sub['params'], fail=sub['fail']) 943 944 def c_code(self, node, name, inputs, outputs, sub): 945 if (node.inputs[0].ndim != node.inputs[1].ndim): 946 raise NotImplementedError("This case does not have C code yet.") 947 948 return """ 949 PyGpuArrayObject *row_x, *row_y; 950 size_t nd = %(params)s->ndim_input_0; 951 ssize_t *start = NULL, *step = NULL; 952 size_t num_indices, j; 953 int ret; 954 int broadcast_y; 955 956 start = (ssize_t*)malloc(nd * sizeof(ssize_t)); 957 step = (ssize_t*)malloc(nd * sizeof(ssize_t)); 958 if (start == NULL || step == NULL) { 959 PyErr_NoMemory(); 960 %(fail)s 961 } 962 963 for (j = 0; j < nd; ++j) { 964 start[j] = 0; 965 step[j] = 1; 966 } 967 step[0] = 0; 968 num_indices = PyArray_SIZE(%(ind)s); 969 if (!%(params)s->inplace) { 970 %(out)s = theano_try_copy(%(out)s, %(x)s); 971 if (%(out)s == NULL) { 972 // Exception already set 973 %(fail)s 974 } 975 } else { 976 Py_XDECREF(%(out)s); 977 %(out)s = %(x)s; 978 Py_INCREF(%(out)s); 979 } 980 if (num_indices != 0) { 981 if ((num_indices - 1) > LONG_MAX) { 982 PyErr_Format(PyExc_AssertionError, 983 "num_indices %%lld exceeds LONG_MAX + 1", (long long)num_indices); 984 %(fail)s 985 } 986 broadcast_y = PyGpuArray_DIM(%(y)s, 0) == 1; 987 for (j = 0; j < num_indices; j++) { 988 start[0] = *(dtype_%(ind)s *)PyArray_GETPTR1(%(ind)s, j); 989 if (start[0] < 0) 990 start[0] += PyGpuArray_DIM(%(out)s, 0); 991 if (start[0] < 0 || start[0] >= PyGpuArray_DIM(%(out)s, 0)) { 992 PyErr_SetString(PyExc_IndexError, "index out of bounds"); 993 %(fail)s; 994 } 995 row_x = pygpu_index(%(out)s, start, (ssize_t *)PyGpuArray_DIMS(%(out)s), step); 996 if (row_x == NULL) 997 %(fail)s; 998 999 if (broadcast_y) 1000 start[0] = 0; 1001 else 1002 start[0] = j; 1003 1004 row_y = pygpu_index(%(y)s, start, (ssize_t *)PyGpuArray_DIMS(%(y)s), step); 1005 if (row_y == NULL) { 1006 Py_DECREF(row_x); 1007 %(fail)s; 1008 } 1009 1010 if (%(params)s->set_instead_of_inc) { 1011 ret = GpuArray_setarray(&row_x->ga, &row_y->ga); 1012 } else { 1013 void *args[2]; 1014 args[0] = (void *)&row_x->ga; 1015 args[1] = (void *)&row_y->ga; 1016 ret = GpuElemwise_call(iadd, args, GE_BROADCAST | GE_PADSHAPE); 1017 } 1018 Py_DECREF(row_x); 1019 Py_DECREF(row_y); 1020 if (ret != GA_NO_ERROR) 1021 PyErr_SetString(PyExc_RuntimeError, "Failed to set/inc elements"); 1022 } 1023 } 1024 1025 free(start); 1026 free(step); 1027 """ % dict(x=inputs[0], y=inputs[1], ind=inputs[2], out=outputs[0], 1028 params=sub['params'], 1029 fail=""" 1030 { 1031 free(start); 1032 free(step); 1033 %(fail)s 1034 } 1035 """ % dict(fail=sub['fail'])) 1036 1037 def c_code_cache_version(self): 1038 return (5,) 1039 1040 1041class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC, 1042 GpuAdvancedIncSubtensor1): 1043 """ 1044 Implement AdvancedIncSubtensor1 on the gpu with atomics 1045 1046 """ 1047 _f16_ok = True 1048 params_type = GpuAdvancedIncSubtensor1.params_type 1049 get_params = GpuAdvancedIncSubtensor1.get_params 1050 1051 def make_node(self, x, y, ilist): 1052 """ 1053 It differs from GpuAdvancedIncSubtensor1 in that it makes sure 1054 the indexes are of type long. 1055 1056 """ 1057 ctx_name = infer_context_name(x, y, ilist) 1058 x_ = as_gpuarray_variable(x, ctx_name) 1059 y_ = as_gpuarray_variable(y.astype(x.dtype), ctx_name) 1060 ilist_ = as_gpuarray_variable(ilist, ctx_name) 1061 1062 assert x_.type.ndim >= y_.type.ndim 1063 1064 if ilist_.type.dtype not in tensor.integer_dtypes: 1065 raise TypeError('index must be integers') 1066 if ilist_.type.ndim != 1: 1067 raise TypeError('index must be vector') 1068 if x_.type.ndim == 0: 1069 raise TypeError('cannot index into a scalar') 1070 if y_.type.ndim > x_.type.ndim: 1071 if self.set_instead_of_inc: 1072 opname = 'set' 1073 else: 1074 opname = 'increment' 1075 raise TypeError( 1076 'cannot %s x subtensor with ndim=%s by y with ndim=%s ' % ( 1077 opname, x_.type.ndim, y_.type.ndim)) 1078 1079 return gof.Apply(self, [x_, y_, ilist_], [x_.type()]) 1080 1081 def perform(self, node, inp, out, params): 1082 return super(GpuAdvancedIncSubtensor1_dev20, self).perform(node, inp, out) 1083 1084 def c_code_cache_version(self): 1085 return (14,) 1086 1087 def c_headers(self): 1088 return ['<numpy_compat.h>', '<gpuarray_helper.h>', 1089 '<gpuarray/types.h>'] 1090 1091 def c_header_dirs(self): 1092 return [gpuarray_helper_inc_dir()] 1093 1094 def c_code(self, node, name, inputs, outputs, sub): 1095 if (node.inputs[0].ndim != node.inputs[1].ndim or 1096 node.inputs[0].ndim != 2): 1097 raise NotImplementedError("This case does not have C code yet.") 1098 1099 return """ 1100int err; 1101if (%(params)s->inplace) { 1102 Py_XDECREF(%(out)s); 1103 %(out)s = %(x)s; 1104 Py_INCREF(%(out)s); 1105} else { 1106 %(out)s = theano_try_copy(%(out)s, %(x)s); 1107} 1108if (!%(out)s) { 1109 // Exception already set 1110 %(fail)s 1111} 1112if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(params)s->set_instead_of_inc)) { 1113 %(fail)s 1114} 1115 """ % dict(x=inputs[0], y=inputs[1], ind=inputs[2], out=outputs[0], fail=sub['fail'], params=sub['params']) 1116 1117 def gpu_kernels(self, node, nodename): 1118 # We can't rely on numpy for this, it changes with the OS 1119 CHARMAP = dict(int32='i', uint32='I', 1120 int64='l', uint64='L', 1121 float16='e', float32='f', float64='d') 1122 dtype_x = node.inputs[0].dtype 1123 dtype_y = node.inputs[1].dtype 1124 dtype_ind = node.inputs[2].dtype 1125 type_x = gpuarray.dtype_to_ctype(dtype_x) 1126 type_y = gpuarray.dtype_to_ctype(dtype_y) 1127 type_ind = gpuarray.dtype_to_ctype(dtype_ind) 1128 flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind) 1129 kname = "k_vector_add_fast" 1130 k_var = "k_vector_add_fast_" + nodename 1131 code = """#include "cluda.h" 1132 KERNEL void k_vector_add_fast(const ga_size numRowsX, 1133 const ga_size numColsX, 1134 const ga_ssize stridesX0, 1135 const ga_ssize stridesX1, 1136 GLOBAL_MEM %(type_x)s *X, 1137 const ga_size offset_X, 1138 const ga_size numRowsY, 1139 const ga_size numColsY, 1140 const ga_ssize stridesY0, 1141 const ga_ssize stridesY1, 1142 GLOBAL_MEM %(type_y)s *Y, 1143 const ga_size offset_Y, 1144 const ga_size numIndices, 1145 const ga_ssize stridesIndices, 1146 GLOBAL_MEM %(type_ind)s *indices_arr, 1147 const ga_size offset_indices_arr, 1148 const ga_int set_instead_of_inc, 1149 GLOBAL_MEM ga_int *err) 1150 { 1151 X = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)X)+offset_X); 1152 Y = (GLOBAL_MEM %(type_y)s *)(((GLOBAL_MEM char *)Y)+offset_Y); 1153 indices_arr = (GLOBAL_MEM %(type_ind)s *)(((GLOBAL_MEM char *)indices_arr)+offset_indices_arr); 1154 1155 for (ga_int i = GID_0; i < numIndices; i += GDIM_0) 1156 { 1157 for (ga_int j = LID_0; j < numColsX; j += LDIM_0) 1158 { 1159 ga_ssize x_row = indices_arr[i * stridesIndices]; 1160 if (x_row < 0) 1161 x_row += numRowsX; 1162 ga_ssize y_row = i; 1163 if (x_row < numRowsX && x_row >= 0) { 1164 if (set_instead_of_inc) { 1165 atom_xchg_%(tc)sg(&X[(x_row * stridesX0) + (j * stridesX1)], 1166 Y[(y_row * stridesY0) + (j * stridesY1)]); 1167 } else { 1168 atom_add_%(tc)sg(&X[(x_row * stridesX0) + (j * stridesX1)], 1169 Y[(y_row * stridesY0) + (j * stridesY1)]); 1170 } 1171 } else { 1172 *err = 1; 1173 } 1174 } 1175 } 1176 return; 1177 } 1178 """ % dict(type_x=type_x, type_y=type_y, type_ind=type_ind, 1179 tc=CHARMAP[dtype_x]) 1180 from pygpu.gpuarray import SIZE, SSIZE 1181 params = [ 1182 SIZE, SIZE, SSIZE, SSIZE, gpuarray.GpuArray, SIZE, 1183 SIZE, SIZE, SSIZE, SSIZE, gpuarray.GpuArray, SIZE, 1184 SIZE, SSIZE, gpuarray.GpuArray, SIZE, 'int32', 1185 gpuarray.GpuArray] 1186 return [Kernel(code=code, name=kname, params=params, 1187 flags=flags, objvar=k_var)] 1188 1189 def c_support_code_struct(self, node, nodename): 1190 return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_struct(node, nodename) + """ 1191 int GpuArray_vector_add_fast(PyGpuArrayObject* py_self, 1192 PyGpuArrayObject* py_other, 1193 PyGpuArrayObject* indices_arr, 1194 const int set_instead_of_inc) 1195 { 1196 size_t threads_per_block = std::min(PyGpuArray_DIMS(py_self)[1], (size_t)256); 1197 size_t n_blocks = std::min(PyGpuArray_SIZE(indices_arr), (size_t)4096); 1198 gpudata *errbuf; 1199 int err, kerr = 0; 1200 size_t itemsize_x = GpuArray_ITEMSIZE(&py_self->ga); 1201 size_t itemsize_y = GpuArray_ITEMSIZE(&py_other->ga); 1202 size_t itemsize_ind = GpuArray_ITEMSIZE(&indices_arr->ga); 1203 1204 if (threads_per_block > 0 && n_blocks > 0) { 1205 err = gpudata_property(py_self->ga.data, 1206 GA_CTX_PROP_ERRBUF, &errbuf); 1207 if (err != GA_NO_ERROR) { 1208 PyErr_SetString(PyExc_RuntimeError, "Can't fetch error buffer"); 1209 return 1; 1210 } 1211 1212 err = k_vector_add_fast_call( 1213 1, &n_blocks, &threads_per_block, 0, 1214 PyGpuArray_DIMS(py_self)[0], 1215 PyGpuArray_DIMS(py_self)[1], 1216 PyGpuArray_STRIDES(py_self)[0] / itemsize_x, 1217 PyGpuArray_STRIDES(py_self)[1] / itemsize_x, 1218 py_self->ga.data, 1219 py_self->ga.offset, 1220 PyGpuArray_DIMS(py_other)[0], 1221 PyGpuArray_DIMS(py_other)[1], 1222 PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / itemsize_y, 1223 PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / itemsize_y, 1224 py_other->ga.data, 1225 py_other->ga.offset, 1226 PyGpuArray_DIMS(indices_arr)[0], 1227 PyGpuArray_STRIDES(indices_arr)[0] / itemsize_ind, 1228 indices_arr->ga.data, 1229 indices_arr->ga.offset, 1230 set_instead_of_inc, 1231 errbuf); 1232 1233 if (err != GA_NO_ERROR) { 1234 PyErr_Format(PyExc_RuntimeError, 1235 "gpuarray error: %(k_var)s: %%s.", 1236 GpuKernel_error(&%(k_var)s, err)); 1237 return 1; 1238 } 1239 err = gpudata_read(&kerr, errbuf, 0, sizeof(int)); 1240 if (err != GA_NO_ERROR) { 1241 PyErr_SetString(PyExc_RuntimeError, "Can't read error buffer"); 1242 return 1; 1243 } 1244 if (kerr != 0) { 1245 PyErr_SetString(PyExc_IndexError, "Index out of bounds"); 1246 kerr = 0; 1247 gpudata_write(errbuf, 0, &kerr, sizeof(int)); 1248 return 1; 1249 } 1250 } 1251 return 0; 1252 } 1253 """ % dict(k_var="k_vector_add_fast_" + nodename) 1254 1255 1256class GpuExtractDiag(Op): 1257 __props__ = ("offset", "axis1", "axis2", "view") 1258 _f16_ok = True 1259 1260 def __init__(self, offset=0, axis1=0, axis2=1, view=False): 1261 self.view = view 1262 if self.view: 1263 self.view_map = {0: [0]} 1264 self.offset = offset 1265 self.axis1 = axis1 1266 self.axis2 = axis2 1267 1268 def make_node(self, _x): 1269 ctx_name = infer_context_name(_x) 1270 x = as_gpuarray_variable(_x, ctx_name) 1271 1272 if x.ndim < 2: 1273 raise ValueError('Diagonal needs an input with 2 or more ' 1274 'dimensions', x) 1275 axis_small, axis_large = sorted((self.axis1, self.axis2)) 1276 broadcastable = x.broadcastable[:axis_small] + \ 1277 x.broadcastable[axis_small + 1:axis_large] + \ 1278 x.broadcastable[axis_large + 1:] + (False,) 1279 return gof.Apply(self, [x], [x.type.clone(broadcastable=broadcastable)()]) 1280 1281 def perform(self, node, inputs, outputs): 1282 (x,) = inputs 1283 (z,) = outputs 1284 # zero-dimensional matrices ... 1285 if x.size == 0: 1286 out_shape = [d for i, d in enumerate(x.shape) 1287 if i not in (self.axis1, self.axis2)] 1288 diag_size = np.min((x.shape[self.axis1], x.shape[self.axis2])) 1289 out_shape.append(diag_size) 1290 z[0] = node.outputs[0].type.value_zeros(tuple(out_shape)) 1291 return 1292 1293 # step 1) slicing on axis1 and axis2. 1294 if self.offset >= 0: 1295 stride_axis, slice_axis = self.axis1, self.axis2 1296 else: 1297 slice_axis, stride_axis = self.axis1, self.axis2 1298 1299 small_axis, large_axis = sorted((x.shape[self.axis1], 1300 x.shape[self.axis2])) 1301 1302 if x.shape[stride_axis] < x.shape[slice_axis]: 1303 # in the bigger triangle 1304 numstride = small_axis - np.max(( 1305 0, small_axis + np.abs(self.offset) - large_axis)) 1306 else: 1307 # in the smaller triangle 1308 numstride = small_axis - np.abs(self.offset) 1309 1310 slicer = [np.s_[:], ] * x.ndim 1311 slicer[stride_axis] = np.s_[:numstride] 1312 slicer[slice_axis] = np.abs(self.offset) 1313 slicer = tuple(slicer) 1314 1315 # step 2) Swap stride_axis to the last dim because we want the dim on 1316 # which the diags extracted be listed as the last dim of the tensor. 1317 # This is also in consistence with the interface of numpy.diagonal. 1318 if slice_axis < stride_axis: 1319 stride_axis -= 1 1320 new_dim_order = list(range(x[slicer].ndim)) 1321 new_dim_order = tuple(new_dim_order[:stride_axis] + 1322 new_dim_order[stride_axis + 1:] + 1323 [stride_axis, ]) 1324 rval = x[slicer].transpose(new_dim_order) 1325 1326 # step 3) modify the strides in the last axis, such that rval becomes 1327 # a view on the diagonal. 1328 other_strides = tuple([d for i, d in enumerate(x.strides) 1329 if i not in (self.axis1, self.axis2)]) 1330 rval.strides = other_strides + \ 1331 (x.strides[self.axis1] + x.strides[self.axis2], ) 1332 1333 if self.view: 1334 z[0] = rval 1335 else: 1336 z[0] = rval.copy() 1337 1338 def grad(self, inputs, gout): 1339 (input_x,) = inputs 1340 return [grad_not_implemented(self, 0, input_x)] 1341 1342 def infer_shape(self, node, shapes): 1343 in_shape, = shapes 1344 dim1 = in_shape[self.axis1] 1345 dim2 = in_shape[self.axis2] 1346 out_shape = [d for i, d in enumerate(in_shape) 1347 if i not in (self.axis1, self.axis2)] 1348 # The following logic is inspired by C code of PyArray_Diagonal(). 1349 offset = self.offset 1350 if offset > 0: 1351 diag_size = T.clip(dim2 - offset, 0, dim1) 1352 elif offset < 0: 1353 diag_size = T.clip(dim1 + offset, 0, dim2) 1354 else: 1355 diag_size = T.minimum(dim1, dim2) 1356 out_shape.append(diag_size) 1357 return [tuple(out_shape)] 1358 1359 1360class GpuAllocDiag(AllocDiag): 1361 __props__ = ("offset", "axis1", "axis2") 1362 1363 def make_node(self, diag): 1364 ctx_name = infer_context_name(diag) 1365 diag = as_gpuarray_variable(diag, ctx_name) 1366 if diag.type.ndim < 1: 1367 raise ValueError('AllocDiag needs an input with 1 or more ' 1368 'dimensions', diag.type) 1369 return gof.Apply( 1370 self, [diag], 1371 [diag.type.__class__( 1372 dtype=diag.dtype, 1373 broadcastable=[False] * (diag.ndim + 1))()] 1374 ) 1375 1376 def perform(self, node, inputs, outputs): 1377 (x,) = inputs 1378 (z,) = outputs 1379 axis1 = np.minimum(self.axis1, self.axis2) 1380 axis2 = np.maximum(self.axis1, self.axis2) 1381 offset = self.offset 1382 1383 # Initialise a buffer the same size as the output 1384 result_shape = x.shape[:-1] + (x.shape[-1] + abs(offset),) * 2 1385 result_buffer_shape = ((np.prod(x.shape[:-1]).astype(np.int64),) + 1386 (x.shape[-1] + abs(offset),) * 2) 1387 result_buffer = gpuarray.zeros(result_buffer_shape, 1388 dtype=x.dtype, 1389 context=x.context) 1390 1391 # Slice out a view of the diagonals 1392 if offset < 0: # diag in the lower triangle 1393 diag_view = result_buffer[:, abs(offset):, 0] 1394 else: # diag in the upper triangle 1395 diag_view = result_buffer[:, :x.shape[-1], abs(offset)] 1396 diag_view.strides = (diag_view.strides[0], 1397 diag_view.strides[1] + x.dtype.itemsize) 1398 1399 # Fill view with flattened array of diagonals 1400 diag_view[:] = x.reshape(diag_view.shape)[:] 1401 1402 # Unflatten buffer into output size 1403 result = result_buffer.reshape(result_shape) 1404 1405 if len(x.shape) > 1: 1406 # Re-order axes so they correspond to diagonals at axis1, axis2 1407 axes = list(range(len(x.shape[:-1]))) 1408 last_idx = axes[-1] 1409 axes = axes[:axis1] + [last_idx + 1] + axes[axis1:] 1410 axes = axes[:axis2] + [last_idx + 2] + axes[axis2:] 1411 result = result.transpose(axes) 1412 1413 z[0] = result 1414 1415 def grad(self, inputs, gout): 1416 (gz,) = gout 1417 return [GpuExtractDiag(offset=self.offset, axis1=self.axis1, axis2=self.axis2)(gz)] 1418