from __future__ import absolute_import, print_function, division
import copy
import numpy as np

import theano
from theano import Apply, scalar, Op
from six.moves import StringIO, xrange
from theano.gof.utils import MethodNotDefined
from theano.scalar import Scalar, Composite
from theano.tensor.elemwise import (Elemwise, DimShuffle, CAReduceDtype)
from theano.scalar.basic_scipy import Erfinv, Erfcinv
from theano.scalar.basic import upgrade_to_float_no_complex, complex_types

try:
    import pygpu
    from pygpu import gpuarray
    from pygpu.tools import ArrayArg
    from pygpu.reduction import ReductionKernel
    from pygpu.gpuarray import dtype_to_typecode
except ImportError:
    pass

from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel,
                        infer_context_name)
from .type import GpuArrayType, gpu_context_type
from .fp16_help import load_w, write_w


def make_argument(v, name):
    return ArrayArg(np.dtype(v.type.dtype), name)


def as_C_string_const(s):
    return '\n'.join('"%s\\n"' % (l.replace('"', '\\"'))
                     for l in s.split('\n'))


def get_scal(dt):
    if dt == 'float16':
        dt = 'float32'
    return scalar.get_scalar_type(dt)


def max_inputs_to_GpuElemwise(node_or_outputs):
    """
    Compute the maximum number of inputs that fit in a kernel call.
    """
    if isinstance(node_or_outputs, Apply):
        outputs = node_or_outputs.outputs
    else:
        outputs = node_or_outputs

    n_out = len(outputs)
    ndim = outputs[0].type.ndim

    ptr_size = 8
    # Even with call32, the interface does not change, and shapes,
    # strides, and offset are passed as 64-bits (8 bytes)
    int_size = 8

    # we take the limit from CUDA for now
    nb_bytes_total = 4096

    # Regardless of the number of arguments, we have:
    # - The total number of elements (int)
    # - The shape (int) on each dimension
    fixed_size = int_size + int_size * ndim

    # Each argument (input or output) has:
    # - 1 pointer (ptr)
    # - 1 offset (int)
    # - 1 stride (int) per dimension
    # Even if the tensor ends up being contiguous, code for the
    # non-contiguous case still needs to be generated.
    param_size = ptr_size + int_size + int_size * ndim

    # Remaining bytes available for the inputs.
    nb_bytes_for_inputs = nb_bytes_total - fixed_size - param_size * n_out

    # Maximum number of inputs
    max_nb_inputs = nb_bytes_for_inputs // param_size

    return max_nb_inputs
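
# Worked example (illustrative only): for 2-d outputs (ndim == 2) and a
# single output (n_out == 1), the byte budget above works out to
#     fixed_size = 8 + 8 * 2 = 24
#     param_size = 8 + 8 + 8 * 2 = 32
#     nb_bytes_for_inputs = 4096 - 24 - 32 = 4040
#     max_nb_inputs = 4040 // 32 = 126
# so such a GpuElemwise can accept at most 126 inputs.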

class GpuElemwise(HideC, Elemwise):
    """
    Elemwise on the GPU.

    """
    params_type = gpu_context_type
    nin = property(lambda self: self.scalar_op.nin)
    nout = property(lambda self: self.scalar_op.nout)
    _f16_ok = True

    def __str__(self):
        if self.name is not None:
            return self.name
        items = str(sorted(self.inplace_pattern.items()))
        return "GpuElemwise{%s}%s<gpuarray>" % (self.scalar_op, items)

    def max_inputs(self, node_or_outputs):
        return max_inputs_to_GpuElemwise(node_or_outputs)

    def make_node(self, *inputs):
        ctx_name = infer_context_name(*inputs)
        inputs = [as_gpuarray_variable(i, ctx_name) for i in inputs]
        out_info = Elemwise.get_output_info(self, GpuDimShuffle, *inputs)
        inputs = out_info[2]
        outputs = [GpuArrayType(broadcastable=br,
                                context_name=ctx_name,
                                dtype=dtype)() for dtype, br in
                   zip(out_info[0], out_info[1])]
        if len(outputs) > 1:
            raise NotImplementedError()

        if len(inputs) > max_inputs_to_GpuElemwise(outputs):
            raise NotImplementedError(
                "Cannot make this GpuElemwise with that many inputs")

        # Try to generate the kernel to catch SupportCodeErrors
        scal_ins = [get_scal(i.dtype) for i in inputs]
        fake_node = self.scalar_op.make_node(*[i() for i in scal_ins])
        try:
            code = fake_node.op.c_support_code_apply(fake_node, "test")
            if code:
                raise SupportCodeError(code)
        except MethodNotDefined:
            pass
        try:
            support_code = fake_node.op.c_support_code()
            if "struct" in support_code:
                # The macro is fine, the C++ struct is not.
                raise SupportCodeError(
                    "structs aren't supported in GpuElemwise support_code" +
                    support_code)
        except MethodNotDefined:
            pass

        node = Apply(self, inputs, outputs)
        return node

    def get_params(self, node):
        return node.inputs[0].type.context

    def _get_vnames(self, node):
        inps = ['i%d' % (n,) for n, _ in enumerate(node.inputs)]
        outs = ['o%d' % (n,) if n not in self.inplace_pattern else
                inps[self.inplace_pattern[n]]
                for n, _ in enumerate(node.outputs)]
        return inps, outs
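
    # For instance, with two inputs and inplace_pattern == {0: 1}, the names
    # are inps == ['i0', 'i1'] and outs == ['i1'], so the single output
    # aliases its second input in the generated kernel.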

    def _generate_op_string(self, node):
        inps, outs = self._get_vnames(node)
        scal_v_ins = [get_scal(i.dtype)() for i in node.inputs]

        # As float16 isn't a C type and most GPUs don't compute on it,
        # we convert the computation to float32 and let libgpuarray
        # load float16 inputs as float32, doing the reverse for
        # the output.
        scalar_op = self.scalar_op
        if isinstance(scalar_op, (scalar.Cast, Composite)):
            scalar_op = scalar_op.clone_float32()
        fake_node = scalar_op.make_node(*scal_v_ins)
        scal_v_out = fake_node.outputs
        assert len(scal_v_out) == len(node.outputs)

        try:
            kop = fake_node.op.c_code(fake_node, 'elem_scalar',
                                      inps, outs,
                                      dict(fail='return;'))
        except MethodNotDefined:
            raise AssertionError(
                "No c code for this scalar. Can not make a GpuElemwise")
        # If the following assert fails, then we need to update the
        # code handler above.
        assert 'npy_float16' not in kop

        support_code = ""
        try:
            # We accept only some c_support_code().
            # This filter is done in make_node().
            support_code += fake_node.op.c_support_code()
        except MethodNotDefined:
            pass
        for npy, ga in [("npy_bool", "ga_bool"),
                        ("npy_uint8", "ga_ubyte"),
                        ("npy_uint16", "ga_ushort"),
                        ("npy_uint32", "ga_uint"),
                        ("npy_uint64", "ga_ulong"),
                        ("npy_int8", "ga_byte"),
                        ("npy_int16", "ga_short"),
                        ("npy_int32", "ga_int"),
                        ("npy_int64", "ga_long"),
                        ("npy_float16", "ga_half"),
                        ("npy_float32", "ga_float"),
                        ("npy_float64", "ga_double"),
                        ]:
            kop = kop.replace(npy, ga)
        return support_code, kop

    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray/types.h>',
                '<gpuarray/elemwise.h>']

    def c_support_code_struct(self, node, name):
        return "\nGpuElemwise *ge;\n"

    def c_init_code_struct(self, node, name, sub):
        inps, outs = self._get_vnames(node)
        nargs = len(inps) + len(outs) - len(self.inplace_pattern)
        support_code, kop = self._generate_op_string(node)
        res = """
        gpuelemwise_arg args[%(nargs)s] = {{0}};
        """ % dict(nargs=nargs)

        for n, (i, name) in enumerate(zip(node.inputs, inps)):
            res += """
            args[%(n)s].name = %(name)s;
            args[%(n)s].typecode = %(typecode)s;
            args[%(n)s].flags = GE_READ;
            """ % dict(n=n, name='"%s"' % (name,),
                       typecode=i.type.typecode)

        p = len(inps)
        for n, o in enumerate(node.outputs):
            if n in self.inplace_pattern:
                assert(len(node.outputs) == 1)
                res += "\nargs[%(n)s].flags |= GE_WRITE;\n" % dict(n=self.inplace_pattern[n])
            else:
                res += """
            args[%(n)s].name = %(name)s;
            args[%(n)s].typecode = %(typecode)s;
            args[%(n)s].flags = GE_WRITE;
            """ % dict(n=p, name='"%s"' % (outs[n],),
                       typecode=o.type.typecode)
                p += 1

        res += """
        ge = GpuElemwise_new(%(ctx)s->ctx, %(support)s, %(kop)s, %(nargs)s, args, %(nd)s, GE_CONVERT_F16);
        if (ge == NULL) {
            PyErr_SetString(PyExc_RuntimeError, "Could not initialize elemwise support");
            %(fail)s
        }
        """ % dict(nargs=nargs, ctx=sub['params'], fail=sub['fail'],
                   support=as_C_string_const(support_code),
                   kop=as_C_string_const(kop), nd=node.inputs[0].ndim)

        return res

    def c_cleanup_code_struct(self, node, name):
        return """
        GpuElemwise_free(ge);
        """

    def c_code(self, node, name, inputs, outputs, sub):
        nd = node.outputs[0].ndim
        fail = sub["fail"]
        initial_dims = ','.join('1' for i in xrange(nd))
        opname = str(self.scalar_op)
        ctx = sub['params']
        nargs = len(node.inputs) + len(node.outputs) - len(self.inplace_pattern)

        # declare the broadcast pattern of each input
        emitted_inames = {}
        code = """
        // +1 is so that MSVC is happy when nd == 0
        size_t dims[%(nd)s+1] = {%(initial_dims)s};
        void *rargs[%(nargs)s] = {0};
        int err;
        """ % locals()
        for idx, iname in enumerate(inputs):
            if iname in emitted_inames:
                assert emitted_inames[iname] is node.inputs[idx]
                continue

            broadcasts = map(int, node.inputs[idx].broadcastable)
            broadcasts = ', '.join(map(str, broadcasts))
            nd = node.inputs[idx].ndim
            code += """
            int broadcasts_%(iname)s[%(nd)s+1] = {%(broadcasts)s};
            """ % locals()
            emitted_inames[iname] = node.inputs[idx]

        # check that all inputs have valid dimensions and collect the
        # kernel arguments
        emitted_inames = {}
        for idx, iname in enumerate(inputs):
            code += "rargs[%(idx)s] = &%(iname)s->ga;\n" % dict(idx=idx, iname=iname)
            if iname in emitted_inames:
                continue
            code += """
        if (%(nd)s != PyGpuArray_NDIM(%(iname)s))
        {
            PyErr_Format(PyExc_TypeError,
                         "need %(nd)s dims, not %%u",
                         PyGpuArray_NDIM(%(iname)s));
            %(fail)s;
        }
        for (int i = 0; i < %(nd)s; ++i)
        {
            dims[i] = (dims[i] == 1) ? PyGpuArray_DIMS(%(iname)s)[i] : dims[i];
            if ((!(broadcasts_%(iname)s[i] &&
                 PyGpuArray_DIMS(%(iname)s)[i] == 1)) &&
                (dims[i] != PyGpuArray_DIMS(%(iname)s)[i]))
            {
                PyErr_Format(PyExc_ValueError,
                             "GpuElemwise. Input dimension mis-match. Input"
                             " %(idx)d (indices start at 0) has shape[%%d] == %%llu"
                             ", but the output's size on that axis is %%llu.",
                             i,
                             (unsigned long long)PyGpuArray_DIMS(%(iname)s)[i],
                             (unsigned long long)dims[i]
                            );
                %(fail)s;
            }
        }
            """ % locals()
            emitted_inames[iname] = True

        # check that all outputs have valid dimensions
        p = len(node.inputs)
        for idx, oname in enumerate(outputs):
            typecode = dtype_to_typecode(node.outputs[idx].dtype)
            if idx not in self.inplace_pattern.keys():
                code += """
        for (int i = 0; (i < %(nd)s) && (%(oname)s); ++i) {
            if (dims[i] != PyGpuArray_DIMS(%(oname)s)[i])
            {
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
            }
        }
        if (%(oname)s && !GpuArray_CHKFLAGS(&(%(oname)s->ga), GA_C_CONTIGUOUS))
        {
            Py_XDECREF(%(oname)s);
            %(oname)s = NULL;
        }
        if (NULL == %(oname)s)
        {
            %(oname)s = pygpu_empty(%(nd)d, dims,
                                    %(typecode)s, GA_C_ORDER,
                                    %(ctx)s, Py_None);
            if (!%(oname)s) {
                %(fail)s
            }
        }
        rargs[%(p)s] = &%(oname)s->ga;
        """ % locals()
                p += 1
            else:
                input_idx = self.inplace_pattern[idx]
                iname = inputs[input_idx]
                code += """
        Py_XDECREF(%(oname)s);
        %(oname)s = %(iname)s;
        Py_INCREF(%(oname)s);
        for (int i = 0; (i < %(nd)s) && (%(oname)s); ++i) {
            if (dims[i] != PyGpuArray_DIMS(%(oname)s)[i])
            {
                PyErr_Format(PyExc_ValueError,
                             "GpuElemwise. Output dimension mis-match. Output"
                             " %(idx)d (indices start at 0), working inplace"
                             " on input %(input_idx)s, has shape[%%i] == %%llu"
                             ", but the output's size on that axis is %%llu.",
                             i,
                             (unsigned long long)PyGpuArray_DIMS(%(oname)s)[i],
                             (unsigned long long)dims[i]
                            );
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
                %(fail)s;
            }
        }
        """ % locals()

        code += """
        if (GpuElemwise_call(ge, rargs, GE_BROADCAST) != GA_NO_ERROR) {
            PyErr_SetString(PyExc_RuntimeError, "Error in the elemwise call");
            %(fail)s
        }
        """ % dict(fail=sub['fail'])

        return str(code)

    # To disable the superclass perform.
    perform = Op.perform

    # Since we don't have a perform ...
    def python_constant_folding(self, node):
        return False

    def c_code_cache_version(self):
        ver = self.scalar_op.c_code_cache_version()
        if ver:
            return (10, ver)
        else:
            return ver


class SupportCodeError(Exception):
    """
    We do not support certain things (such as the C++ complex struct).

    """


class GpuDimShuffle(DimShuffle):
    """
    DimShuffle on the GPU.

    """
    _f16_ok = True
    c_func_name = 'APPLY_SPECIFIC(gpu_dimshuffle)'

    def make_node(self, input):
        ctx_name = infer_context_name(input)
        res = DimShuffle.make_node(self, input)
        otype = GpuArrayType(dtype=res.outputs[0].type.dtype,
                             broadcastable=res.outputs[0].type.broadcastable,
                             context_name=ctx_name)
        input = as_gpuarray_variable(input, ctx_name)
        return Apply(self, [input], [otype()])

    def __str__(self):
        if self.inplace:
            s = "InplaceGpuDimShuffle{%s}"
        else:
            s = "GpuDimShuffle{%s}"
        return s % (','.join(str(x) for x in self.new_order))

    def perform(self, node, inp, out, params):
        input, = inp
        storage, = out

        res = input

        res = res.transpose(self.shuffle + self.drop)

        shape = list(res.shape[:len(self.shuffle)])
        for augm in self.augment:
            shape.insert(augm, 1)
        res = res.reshape(shape)

        if not self.inplace:
            res = res.copy()

        storage[0] = res


class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
    """
    GpuCAReduceCuda is a Reduction along some dimensions by a scalar op.

    Parameters
    ----------
    reduce_mask
        The dimensions along which to reduce. The `reduce_mask` is a tuple of
        booleans (actually integers 0 or 1) that specify for each input
        dimension, whether to reduce it (1) or not (0).
    pre_scalar_op
        If present, must be a scalar op with only 1 input. We will execute it
        on the input value before reduction.

    Examples
    --------
    When scalar_op is a theano.scalar.basic.Add instance:

    - reduce_mask == (1,) sums a vector to a scalar

    - reduce_mask == (1,0) computes the sum of each column in a matrix

    - reduce_mask == (0,1) computes the sum of each row in a matrix

    - reduce_mask == (1,1,1) computes the sum of all elements in a 3-tensor.

    Notes
    -----
    Any reduce_mask of all zeros is a sort of 'copy', and may be removed during
    graph optimization.

    This Op is a work in progress.

    This op was recently upgraded from just GpuSum to a general CAReduce. Not
    many code cases are supported for scalar_op being anything other than
    scalar.Add instances yet.

    Important note: if you implement new cases for this op, be sure to
    benchmark them and make sure that they actually result in a speedup.
    GPUs are not especially well-suited to reduction operations so it is
    quite possible that the GPU might be slower for some cases.

    """
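    # Note that reduce_mask and axis describe the same thing: for example,
    # axis=(0, 2) on a 3-d input corresponds to reduce_mask == (1, 0, 1),
    # and axis=None reduces every dimension (a mask of all ones).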
    __props__ = ('axis', 'reduce_mask', 'dtype', 'acc_dtype', 'scalar_op',
                 'pre_scalar_op')
    _f16_ok = True
    verbose = 0

    def __init__(self, scalar_op, axis=None,
                 reduce_mask=None, dtype=None, acc_dtype=None,
                 pre_scalar_op=None):
        if reduce_mask is not None:
            reduce_mask = tuple(reduce_mask)
        self.reduce_mask = reduce_mask

        # used to make sure that calls to scalar op
        # have unique name arguments
        self._n_scalar_op_calls = 0
        CAReduceDtype.__init__(self, scalar_op, axis=axis,
                               dtype=dtype, acc_dtype=acc_dtype)
        self.pre_scalar_op = pre_scalar_op
        if pre_scalar_op:
            assert pre_scalar_op.nin == 1

    def __str__(self):
        pre = ""
        if self.pre_scalar_op:
            pre = "pre=%s,red=" % str(self.pre_scalar_op)
        ax = ''
        if self.axis is not None:
            ax = '{%s}' % (', '.join(str(x) for x in self.axis),)
        return "GpuCAReduceCuda{%s%s}%s" % (pre, str(self.scalar_op), ax)

    def __setstate__(self, d):
        self.__dict__.update(d)
        # For unpickling of old ops.
        if not hasattr(self, "pre_scalar_op"):
            self.pre_scalar_op = None

    def make_node(self, x):
        x = as_gpuarray_variable(x, infer_context_name(x))
        if x.type.context.kind != b'cuda':
            raise TypeError("GpuCAReduceCuda doesn't work for non-cuda devices")
        ret = super(GpuCAReduceCuda, self).make_node(x)
        self = copy.copy(self)
        self.axis = ret.op.axis
        if self.pre_scalar_op:
            # Currently we have only tested pre_scalar_op instances that
            # don't cause an upcast.
            assert Elemwise(self.pre_scalar_op)(x).dtype == x.dtype
        if self.reduce_mask is None:
            if self.axis is None:
                reduce_mask = [1] * x.type.ndim
            else:
                reduce_mask = [0] * x.type.ndim
                for a in self.axis:
                    assert reduce_mask[a] == 0
                    reduce_mask[a] = 1
            self.reduce_mask = tuple(reduce_mask)

        if (x.type.ndim != len(self.reduce_mask)):
            raise TypeError("x must have rank %i" % len(self.reduce_mask))
        if ("complex" in x.dtype or
                "complex" in ret.outputs[0].dtype or
                "complex" in self._acc_dtype(x.dtype)):
            raise NotImplementedError("We don't support complex in gpu reduction")
        return Apply(self, [x], [GpuArrayType(ret.outputs[0].dtype,
                                              ret.outputs[0].type.broadcastable,
                                              context_name=x.type.context_name)()])

    def perform(self, node, inp, out, ctx):
        theano.Op.perform(self, node, inp, out, ctx)

    def supports_c_code(self, inputs):
        """
        Returns True if the current op and reduce pattern has functioning C code.

        """
        # If we don't even have the right method, we certainly
        # don't support the C code
        # (This is the test that used to be implemented by
        # local_gpu_sum)
        pattern = (''.join(str(i) for i in self.reduce_mask))
        if not hasattr(self, 'c_code_reduce_%s' % pattern):
            return False

        # Now that this is a general reduction op, we might
        # have a method for a pattern, but that pattern
        # might not be implemented for the current scalar op.
        # To detect this more complicated situation, we
        # make fake arguments to c_code, try to run them,
        # and see if NotImplementedError gets raised.

        node = self.make_node(*inputs)

        name = 'fake_name'

        inp = ['fake_input_name_%d' % i for i in xrange(len(inputs))]
        out = ['fake_output_name_%d' % i for i in xrange(len(node.outputs))]

        sub = {'fail': 'fake failure code', 'params': 'fake context'}

        try:
            self.c_code(node, name, inp, out, sub)
            if not self.gpu_kernels(node, name):
                return False
        except NotImplementedError:
            return False
        return True

    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray/types.h>']

    def c_support_code(self):
        return """
        template <typename T>
        static T ceil_intdiv(T a, T b)
        {
            return (a/b) + ((a % b) ? 1: 0);
        }
        """
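
    # ceil_intdiv rounds an integer division up, e.g. ceil_intdiv(10, 3) == 4
    # while ceil_intdiv(9, 3) == 3; it is used further down when sizing the
    # launch grid.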

    def c_code(self, node, name, inp, out, sub):
        x, = inp
        z, = out

        nd_in = node.inputs[0].type.ndim
        nd_out = node.outputs[0].type.ndim
        # For complex, we need to use theano_complex* in the c code to
        # have it run. But libgpuarray doesn't understand it.
        in_dtype = node.inputs[0].type.dtype_specs()[1]
        out_dtype = node.outputs[0].type.dtype_specs()[1]
        gin_dtype = "npy_" + node.inputs[0].dtype
        gout_dtype = "npy_" + node.outputs[0].dtype
        assert nd_in - nd_out == sum(self.reduce_mask)

        sio = StringIO()
        fail = sub['fail']
        ctx = sub['params']

        # check input
        print("""
        if (PyGpuArray_NDIM(%(x)s) != %(nd_in)s)
        {
            PyErr_Format(PyExc_TypeError,
                         "required nd=%(nd_in)s, got nd=%%u", PyGpuArray_NDIM(%(x)s));
            %(fail)s;
        }
        """ % locals(), file=sio)

        # It might be nice to use a property of the op class to do this,
        # but tensor.elemwise.CAReduce has this exact same check so I guess
        # this is OK to do
        if self.scalar_op in [scalar.minimum, scalar.maximum]:
            conds = ["(PyGpuArray_DIMS(%s)[%d] == 0)" % (x, i)
                     for i in xrange(nd_in)
                     if self.reduce_mask[i]]
            assert len(conds) > 0
            cond = "(" + " || ".join(conds) + ")"
            print("""
        if %(cond)s
        {
            PyErr_Format(PyExc_ValueError," tried to reduce a 0-length axis.");
            %(fail)s;
        }
        """ % locals(), file=sio)

        #
        # alloc an output if we need one
        #

        # check the basics of our output
        print("""
        if (  !%(z)s
           || (PyGpuArray_NDIM(%(z)s) != %(nd_out)s)
        """ % locals(), file=sio)

        # ensure that the output has the right non-reduced dimensions
        j = 0
        for i in xrange(nd_in):
            if not self.reduce_mask[i]:
                print(" || (PyGpuArray_DIMS(%(z)s)[%(j)s] != PyGpuArray_DIMS(%(x)s)[%(i)d]) " % locals(), file=sio)
                j += 1

        print("""
           )
        {
        """ % locals(), file=sio)
        if nd_out > 0:
            print("size_t new_dims[%(nd_out)s]; " % locals(), file=sio)
        else:
            print("size_t *new_dims=NULL; ", file=sio)

        j = 0
        for i in xrange(nd_in):
            if not self.reduce_mask[i]:
                print('new_dims[%(j)s] = PyGpuArray_DIMS(%(x)s)[%(i)s];' % locals(), file=sio)
                j += 1
        out_typecode = dtype_to_typecode(gout_dtype[4:])
        print("""
            Py_XDECREF(%(z)s);
            %(z)s = pygpu_empty(%(nd_out)s, new_dims,
                                %(out_typecode)s, GA_C_ORDER,
                                %(ctx)s, Py_None);
            if (NULL == %(z)s)
            {
                PyErr_Format(PyExc_RuntimeError, "Failed to allocate output");
                %(fail)s;
            }
        }
        """ % locals(), file=sio)

        # \begin bracket the reduction in a check that there is
        # actually work to do
        if getattr(self.scalar_op, 'identity', None) == 0:
            zero_shp = "GpuArray_memset(&%(z)s->ga, 0)" % locals()
        # TODO: elif getattr(self.scalar_op, 'identity', None) == 1:
        else:
            scalar_op = self.scalar_op
            zero_shp = """
        PyErr_Format(PyExc_NotImplementedError,
                     "GpuCAReduceCuda not implemented when input shape is 0"
                     " for this scalar_op: %(scalar_op)s");
        %(fail)s;
        """ % locals()
        print("""
        if (PyGpuArray_SIZE(%(z)s) && ! PyGpuArray_SIZE(%(x)s)){
            %(zero_shp)s;
        }
        else if (PyGpuArray_SIZE(%(z)s))
        {
        """ % locals(), file=sio)

        #
        # Now perform the reduction
        #

        if all(i == 1 for i in self.reduce_mask):
            # check if the tensor is ccontiguous, if true, use the c_code_reduce_ccontig code.
            # TODO: check if we are ccontiguous when we un-dimshuffle
            # TODO: if only some dims are ccontiguous, call version with less dims.
            print('if(%(x)s->ga.flags & GA_C_CONTIGUOUS){' % locals(),
                  file=sio)
            self.c_code_reduce_ccontig(sio, node, name, x, z, fail)
            print("}else{", file=sio)
            getattr(self, 'c_code_reduce_%s' %
                    (''.join(str(i) for i in self.reduce_mask)))(
                sio, node, name, x, z, fail)
            print("}", file=sio)
        else:
            getattr(self, 'c_code_reduce_%s' % (''.join(
                str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail)

        # \end bracket the reduction ...
        print("""
        }
        """ % locals(), file=sio)

        return sio.getvalue()

    def _makecall(self, node, name, x, z, fail, pattern=None, extra_dims=(), extra_strides=()):
        """
        Return a string for making a kernel call.

        The return value looks something like:

        .. code-block:: c

            ssize_t stride_A0 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
            ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
            ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
            if (verbose)
                printf("running kernel_reduce_10_%(name)s\\n");
            size_t n_shared = sizeof(%(acc_dtype)s) * n_threads[0] * n_threads[1] * n_threads[2];
            void *kernel_params[] = {
                    (void *)&PyGpuArray_DIMS(%(x)s)[0],
                    (void *)&PyGpuArray_DIMS(%(x)s)[1],
                    (void *)%(x)s->ga.data,
                    (void *)&%(x)s->ga.offset,
                    (void *)&stride_A0,
                    (void *)&stride_A1,
                    (void *)%(z)s->ga.data,
                    (void *)&%(z)s->ga.offset,
                    (void *)&stride_Z0};
            int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, n_threads, n_shared, kernel_params);
            %(err_check)s

        """
        in_dtype = "npy_" + node.inputs[0].dtype
        out_dtype = "npy_" + node.outputs[0].dtype
        acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
        sio = StringIO()
        if pattern is None:
            pattern = ''.join(str(c) for c in self.reduce_mask)
        ndim = len(self.reduce_mask)
        nd_out = ndim - sum(self.reduce_mask)
        shapes_format = "shape=(%s)" % ",".join(["%llu"] * node.inputs[0].ndim)
        shapes_data = ",".join(["(size_t) PyGpuArray_DIMS(%s)[%d]" % (x, i)
                                for i in range(node.inputs[0].ndim)])
        k_var = "kernel_reduce_%(pattern)s_%(name)s" % locals()
        params = []

        for i in xrange(ndim):
            params.append("(void *)&PyGpuArray_DIMS(%(x)s)[%(i)s]" % locals())
        for declaration, value in extra_dims:
            print(declaration % locals(), file=sio)
            params.append(value)
        params.append("(void *)%(x)s->ga.data" % locals())
        params.append("(void *)&%(x)s->ga.offset" % locals())
        for i in xrange(ndim):
            print("""
        ssize_t stride_A%(i)d = PyGpuArray_STRIDES(%(x)s)[%(i)s]/sizeof(%(in_dtype)s);
        """ % locals(), file=sio)
            params.append("(void *)&stride_A%(i)d" % locals())
        for declaration, value in extra_strides:
            print(declaration % locals(), file=sio)
            params.append(value)

        params.append("(void *)%(z)s->ga.data" % locals())
        params.append("(void *)&%(z)s->ga.offset" % locals())
        for i in xrange(nd_out):
            print("""
        ssize_t stride_Z%(i)d = PyGpuArray_STRIDES(%(z)s)[%(i)s]/sizeof(%(out_dtype)s);
        """ % locals(), file=sio)
            params.append("(void *)&stride_Z%(i)d" % locals())
        kernel_params = ', '.join(params)
        err_check = """
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "gpuarray error: %(k_var)s: %%s.",
                             GpuKernel_error(&%(k_var)s, err));
                %(fail)s;
            }
        """ % locals()
        print("""
        if (verbose)
            printf("running kernel_reduce_%(pattern)s_%(name)s\\n");
        size_t n_shared = sizeof(%(acc_dtype)s) * n_threads[0] * n_threads[1] * n_threads[2];
        void *kernel_params[] = { %(kernel_params)s };
        if (verbose > 1)
            printf("n_threads[0]=%%lu, n_threads[1]=%%lu, "
                   "n_threads[2]=%%lu, n_threads=%%lu, "
                   "n_blocks[0]=%%lu, n_blocks[1]=%%lu, n_blocks[2]=%%lu, "
                   "n_blocks=%%lu, n_shared=%%d, %(shapes_format)s\\n",
                   n_threads[0], n_threads[1],
                   n_threads[2],
                   n_threads[0]*n_threads[1]*
                   n_threads[2],
                   n_blocks[0], n_blocks[1], n_blocks[2],
                   n_blocks[0]*n_blocks[1]*n_blocks[2],
                   n_shared, %(shapes_data)s);
        int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, n_threads, n_shared, kernel_params);
        %(err_check)s
        """ % locals(), file=sio)

        return sio.getvalue()

    def _k_decl(self, node, nodename, pattern=None,
                ndim=None, reduce_mask=None):
        """
        Return a string to declare a kernel function.

        The result will look something like this:

        .. code-block:: c

            KERNEL void kernel_reduce_110_%(nodename)s(
                    const ga_size d0,
                    const ga_size d1,
                    const ga_size d2,
                    const %(in_type)s *A,
                    const ga_size offset_A,
                    const ga_ssize sA0,
                    const ga_ssize sA1,
                    const ga_ssize sA2,
                    %(out_type)s * Z,
                    const ga_size offset_Z,
                    const ga_ssize sZ0)

        Since the nodename is unique, we don't need to put the name
        of the scalar_op in here.

        """
        in_dtype = node.inputs[0].dtype
        out_dtype = node.outputs[0].dtype
        in_type = gpuarray.dtype_to_ctype(in_dtype)
        out_type = gpuarray.dtype_to_ctype(out_dtype)
        if reduce_mask is None:
            reduce_mask = self.reduce_mask
        if ndim is None:
            ndim = len(reduce_mask)
        if pattern is None:
            pattern = ''.join(str(i) for i in reduce_mask)
        kname = "kernel_reduce_%(pattern)s" % locals()
        k_var = "kernel_reduce_%(pattern)s_%(nodename)s" % locals()
        params = []
        sio = StringIO()

        print("""
            KERNEL void %(kname)s(
        """ % locals(), file=sio)
        for i in xrange(ndim):
            params.append('uintp')
            print("""
                    const ga_size d%(i)s,
        """ % locals(), file=sio)
        params.append(gpuarray.GpuArray)
        params.append('uintp')
        print("""
                    const %(in_type)s *A, const ga_size offset_A,
        """ % locals(), file=sio)
        for i in xrange(ndim):
            params.append('intp')
            print("""
                    const ga_ssize sA%(i)s,
        """ % locals(), file=sio)
        params.append(gpuarray.GpuArray)
        params.append('uintp')
        print("""
                    %(out_type)s * Z, const ga_size offset_Z
        """ % locals(), file=sio)
        for i in xrange(ndim - sum(reduce_mask)):
            params.append('intp')
            print("""
                    , const ga_ssize sZ%(i)s
        """ % locals(), file=sio)
        print(")", file=sio)
        return sio.getvalue(), kname, params, k_var

    def _k_init(self, node, nodename):
        in_dtype = node.inputs[0].dtype
        out_dtype = node.outputs[0].dtype
        acc_dtype = self._acc_dtype(node.inputs[0].dtype)
        # We need to use theano_complex* and not npy_complex*
        in_type = gpuarray.dtype_to_ctype(in_dtype)
        out_type = gpuarray.dtype_to_ctype(out_dtype)
        acc_type = gpuarray.dtype_to_ctype(acc_dtype)

        return """
                const int threadCount = blockDim.x * blockDim.y * blockDim.z;
                const int threadNum = threadIdx.z * blockDim.x * blockDim.y
                + threadIdx.y * blockDim.x + threadIdx.x;
                extern __shared__ %(acc_type)s buf[];
                A = (const %(in_type)s *)(((char *)A)+offset_A);
                Z = (%(out_type)s *)(((char *)Z)+offset_Z);
                %(acc_type)s myresult = 0;
        """ % locals()

    def _assign_init(self, first_item, dtype):
        """
        This returns the initial value for myresult.
        If the scalar op has an identity value, return it.

        Otherwise, check that the scalar op is maximum or minimum
        and return first_item. It should be the first element of the reduction.
        As the maximum and minimum of the same value don't change, this works.

        """
        if hasattr(self.scalar_op, 'identity'):
            return str(self.scalar_op.identity)
        else:
            assert isinstance(self.scalar_op, (scalar.Maximum,
                                               scalar.Minimum))
            if self.pre_scalar_op:  # TODO: multiple dtypes
                # dtype = node.inputs[0].dtype

                dummy_var = scalar.Scalar(dtype=dtype)()

                dummy_node = self.pre_scalar_op.make_node(dummy_var)

                dummy_name = 'assign_init_pre_scalar_op' + str(self._n_scalar_op_calls)
                self._n_scalar_op_calls += 1
                t = self.pre_scalar_op.c_code(dummy_node, dummy_name,
                                              (first_item,), ("",), {})
                assert t.startswith(' = ')
                first_item = t[3:]
                if first_item[-1] == ';':
                    first_item = first_item[:-1]

            return first_item
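
    # For example, a max reduction has no identity value, so the accumulator
    # is seeded with the first element that the thread reads, while a sum
    # (scalar.Add, identity 0) is seeded with "0" instead.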
    def _assign_reduce(self, node, name, left, right, sub, pre):
        """

        Parameters
        ----------
        node
            The node argument to this op's c_code.
        name
            The name argument to this op's c_code.
        left
            A C code string identifying an lvalue.
        right
            A C code string identifying an expression.
        sub
            The sub argument to this op's c_code.
        pre
            If True, we will add the pre_scalar_op.c_code.

        Returns
        -------
        str
            C code to reduce left and right, assigning the result to left.

        """

        x, = node.inputs
        in_dtype = x.dtype
        out_dtype = node.outputs[0].dtype

        dummy_left = Scalar(dtype=out_dtype)()
        dummy_right = Scalar(dtype=in_dtype)()

        dummy_node = self.scalar_op.make_node(dummy_left, dummy_right)

        dummy_name = name + '_scalar_op' + str(self._n_scalar_op_calls)
        self._n_scalar_op_calls += 1

        if pre and self.pre_scalar_op:
            assert left == "myresult"
            dummy_node = self.pre_scalar_op.make_node(dummy_left)
            dummy_name = name + '_scalar_op' + str(self._n_scalar_op_calls)
            self._n_scalar_op_calls += 1
            t = self.pre_scalar_op.c_code(dummy_node, dummy_name,
                                          (right,), ("",), sub)
            assert t.startswith(' = ')
            right = t[3:]
            if right[-1] == ';':
                right = right[:-1]

        return self.scalar_op.c_code(dummy_node, dummy_name, (left, right),
                                     (left,), sub)

    def _k_reduce_buf(self, z_pos, node, name, sub):
        """
        WRITEME

        Parameters
        ----------
        node, name, sub
            These should be passed through from the original call to c_code.

        """
        in_dtype = "npy_" + node.inputs[0].dtype
        out_dtype = "npy_" + node.outputs[0].dtype
        acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
        write_out = write_w(node.outputs[0].dtype)

        current_version = """
        __syncthreads(); // some kernels do multiple reductions.
        buf[threadNum] = myresult;
        __syncthreads();

        // rest of function is handled by one warp
        if (threadNum < warpSize) {
            //round up all the partial sums into the first `warpSize` elements
            for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
            {
                """
        current_version += self._assign_reduce(node, name,
                                               'myresult', 'buf[i]',
                                               sub, False) + """
            }
            buf[threadNum] = myresult;
        }
        __syncthreads();
        for (unsigned int _n = warpSize / 2; _n > 0; _n /= 2) {
            if (threadNum < _n && threadNum + _n < threadCount)
            """
        current_version += self._assign_reduce(node, name, 'buf[threadNum]',
                                               'buf[threadNum+_n]', sub, False)

        current_version += """
            __syncthreads();
        }
        if (threadNum == 0) {
            %(z_pos)s = %(write_out)s(buf[0]);
        }
        """

        current_version = current_version % locals()

        return current_version

    # Threads must be organized such that threads sharing the same value of
    # threadNum % nb_reduce contribute to the same sum; nb_reduce <= warpSize.
    def _k_reduce_buf_multiple(self, z_pos, node, name, nb_reduce):
        reduce_fct = self._assign_reduce(node, name, 'myresult', 'buf[i]', {}, False)
        write_out = write_w(node.outputs[0].dtype)

        return """
        __syncthreads(); // some kernels do multiple reductions.
        buf[threadNum] = myresult;
        __syncthreads();

        // rest of function is handled by one warp
        if (threadNum < %(nb_reduce)s)
        {
            //round up all the partial sums into the first `nb_reduce` elements
            for (int i = threadNum + %(nb_reduce)s; i < threadCount; i += %(nb_reduce)s)
            {
                %(reduce_fct)s;
            }
            %(z_pos)s = %(write_out)s(myresult);
        }
        """ % locals()

    def c_code_reduce_ccontig(self, sio, node, name, x, z, fail):
        verbose = self.verbose
        in_dtype = "npy_" + node.inputs[0].dtype
        out_dtype = "npy_" + node.outputs[0].dtype
        if getattr(self.scalar_op, 'identity', None) == 0:
            zero_shp = "GpuArray_memset(&%(z)s->ga, 0)" % locals()
        # TODO: elif getattr(self.scalar_op, 'identity', None) == 1:
        else:
            zero_shp = """
        PyErr_Format(PyExc_NotImplementedError,
                     "GpuCAReduceCuda not implemented when input shape is 0 for this scalar_op");
        %(fail)s;
        """ % locals()

        acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
        k_var = "kernel_reduce_ccontig_%(name)s" % locals()
        err_check = """
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "gpuarray error: %(k_var)s: %%s.",
                             GpuKernel_error(&%(k_var)s, err));
                %(fail)s;
            }
        """ % locals()

        print("""
        {
          if(PyGpuArray_SIZE(%(x)s)==0){
            %(zero_shp)s;
          }else{
            int verbose = %(verbose)s;
            size_t numEls = PyGpuArray_SIZE(%(x)s);
            size_t n_threads = std::min(numEls, (size_t) 256);
            size_t n_blocks = 1;
            void *kernel_params[] = {(void *)&numEls,
                                     (void *)%(x)s->ga.data,
                                     (void *)&%(x)s->ga.offset,
                                     (void *)%(z)s->ga.data,
                                     (void *)&%(z)s->ga.offset};
            if (verbose) printf("running kernel_reduce_ccontig_%(name)s"
                                " n_threads=%%llu, size=%%llu, ndim=%%u\\n",
                                n_threads, numEls,
                                PyGpuArray_NDIM(%(x)s));
            size_t n_shared = sizeof(%(acc_dtype)s) * n_threads;
            int err = GpuKernel_call(&%(k_var)s, 1, &n_blocks, &n_threads, n_shared, kernel_params);
            %(err_check)s
          }
        }
        """ % locals(), file=sio)

    def c_code_reduce_1(self, sio, node, name, x, z, fail):
        verbose = self.verbose
        makecall = self._makecall(node, name, x, z, fail)
        print("""
        {
            int verbose = %(verbose)s;
            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1};
            size_t n_blocks[3] = {1, 1, 1};
            %(makecall)s
        }
        """ % locals(), file=sio)

    def c_code_reduce_11(self, sio, node, name, x, z, fail):
        verbose = self.verbose
        makecall = self._makecall(node, name, x, z, fail)
        print("""
        {
            int verbose = %(verbose)s;

            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 256), 1, 1};
            while (n_threads[1] * n_threads[0] <= 256) ++n_threads[1];
            n_threads[1] -= 1;
            if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[0])
                n_threads[1] = PyGpuArray_DIMS(%(x)s)[0];

            size_t n_blocks[3] = {1, 1, 1};
            %(makecall)s
        }
        """ % locals(), file=sio)

    def c_code_reduce_01X(self, sio, node, name, x, z, fail, N):
        """

        Parameters
        ----------
        N
            The number of 1s in the pattern: N=1 -> 01, N=2 -> 011, N=3 -> 0111.
            Works for N=1, 2, 3.

        """

        assert N in [1, 2, 3]
        verbose = self.verbose
        in_dtype = "npy_" + node.inputs[0].dtype
        out_dtype = "npy_" + node.outputs[0].dtype
        makecall = self._makecall(node, name, x, z, fail)
        N_pattern = ''.join(['1'] * N)
        param_dim = ",".join(["PyGpuArray_DIMS(%s)[%d]" % (x, i)
                              for i in xrange(N + 1)])
        strides_dim = ",".join(["PyGpuArray_STRIDES(%s)[%d]/sizeof(%s)"
                                % (x, i, in_dtype) for i in xrange(N + 1)])

        threads_y = """
            //get as many y threads as we can fit
            while (n_threads[0] * (n_threads[1]+1) <= 256)
            {
                if (n_threads[1] < PyGpuArray_DIMS(%(x)s)[%(N)s-1])
                    n_threads[1] += 1;
                else
                    break;
            }""" % locals()

        threads_z = """
            //get as many z threads as we can fit
            while (n_threads[0] * n_threads[1] * (n_threads[2]+1) <= 256)
            {
                if (n_threads[2] < PyGpuArray_DIMS(%(x)s)[%(N)s-2])
                    n_threads[2] += 1;
                else
                    break;
            }
            //Maximum for Fermi GPU on that dimensions.
            n_threads[2] = std::min(n_threads[2], (size_t)64);
        """ % locals()

        if len(self.reduce_mask) == 2:
            threads_y = ''
            threads_z = ''

        if len(self.reduce_mask) == 3:
            threads_z = ''

        print("""
        {
            int verbose = %(verbose)s;
            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[%(N)s], (size_t) 256), 1, 1};
            %(threads_y)s
            %(threads_z)s
            size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 4096), 1, 1};
            %(makecall)s
        }
        """ % locals(), file=sio)

    def c_code_reduce_01(self, sio, node, name, x, z, fail):
        self.c_code_reduce_01X(sio, node, name, x, z, fail, 1)

    def c_code_reduce_011(self, sio, node, name, x, z, fail):
        self.c_code_reduce_01X(sio, node, name, x, z, fail, 2)

    def c_code_reduce_0111(self, sio, node, name, x, z, fail):
        self.c_code_reduce_01X(sio, node, name, x, z, fail, 3)

    def c_code_reduce_10(self, sio, node, name, x, z, fail):
        verbose = self.verbose
        in_dtype = "npy_" + node.inputs[0].dtype
        out_dtype = "npy_" + node.outputs[0].dtype
        acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
        k_var = "kernel_reduce_10_%(name)s" % locals()
        err_check = """
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "gpuarray error: %(k_var)s: %%s.",
                             GpuKernel_error(%(k_var)s, err));
                %(fail)s;
            }
        """ % locals()

        print("""
        {
            int verbose = %(verbose)s;
            if(PyGpuArray_STRIDES(%(x)s)[0] >
               PyGpuArray_STRIDES(%(x)s)[1]){
                // If there are a lot of summations to do, then we can use simple parallelization -
                // use each thread to do one sum.

                // we might as well launch blocks of 32 threads because that's the warp size.
                // we could schedule more threads if we were maxing out the gridsize below, but
                // the gridsize is way more than the physical hardware and I think 32 threads
                // on a huge grid is enough to fully use the hardware.
                size_t n_threads[3] = {32, 1, 1};

                // We kindof reshape the input implicitly to something 4D:
                //  the shape A,B,C    ->   A, B, D, E
                //  where C <= D*E < C+32
                //  where E==32

                GpuKernel *%(k_var)s = &kernel_reduce_010_AD_%(name)s;
                size_t A = 1;
                size_t B = PyGpuArray_DIMS(%(x)s)[0];
                size_t C = PyGpuArray_DIMS(%(x)s)[1];
                size_t D = C/32;
                if (32*D < C) D+= 1;
                assert ((C <= 32*D) && (32*D < C+32));

                // The gridsize would ideally be (A, D).  But we do the following logic to make
                // sure we don't ask for a grid that is too big.
                size_t n_blocks[3] = {A, D, 1};
                if (n_blocks[0] > 4096) n_blocks[0] = 4096;
                if (n_blocks[0]*n_blocks[1] > 4096) n_blocks[1] = 4096/n_blocks[0];
                ssize_t stride_A0 = 1;
                ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
                ssize_t stride_A2 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
                ssize_t stride_Z0 = 1;
                ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
                void *kernel_params[] = {
                        (void *)&A, (void *)&B, (void *)&C, (void *)&D,
                        (void *)%(x)s->ga.data,
                        (void *)&%(x)s->ga.offset,
                        (void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2,
                        (void *)%(z)s->ga.data,
                        (void *)&%(z)s->ga.offset,
                        (void *)&stride_Z0, (void *)&stride_Z1};
                int err = GpuKernel_call(%(k_var)s, 3, n_blocks, n_threads, 0, kernel_params);
                %(err_check)s
            }else{
                GpuKernel *%(k_var)s = &kernel_reduce_010_%(name)s;
                size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1};
                size_t n_blocks[3] = {1, std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 4096), 1};
                if (verbose) {
                    fprintf(stderr,
                            "running kernel_reduce_10_%(name)s n_blocks=(%%llu,%%llu)\\n",
                            (unsigned long long)n_blocks[0],
                            (unsigned long long)n_blocks[1]);
                }
                assert(PyGpuArray_DIMS(%(x)s)[1] == PyGpuArray_DIMS(%(z)s)[0]);
                size_t n_shared = sizeof(%(acc_dtype)s) * n_threads[0];
                size_t dim_0 = 1;
                ssize_t stride_A0 = 1;
                ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
                ssize_t stride_A2 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
                ssize_t stride_Z0 = 1;
                ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
                void *kernel_params[] = {
                        (void *)&dim_0,
                        (void *)&PyGpuArray_DIMS(%(x)s)[0],
                        (void *)&PyGpuArray_DIMS(%(x)s)[1],
                        (void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset,
                        (void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2,
                        (void *)%(z)s->ga.data, (void *)&%(z)s->ga.offset,
                        (void *)&stride_Z0, (void *)&stride_Z1};
                int err = GpuKernel_call(%(k_var)s, 3, n_blocks, n_threads, n_shared, kernel_params);
                %(err_check)s
            }
        }
        """ % locals(), file=sio)

    def c_code_reduce_010(self, sio, node, name, x, z, fail):
        verbose = self.verbose
        makecall = self._makecall(node, name, x, z, fail)
        makecall_inner = self._makecall(node, name, x, z, fail,
                                        pattern="010_inner")
        pattern = ''.join(str(i) for i in self.reduce_mask)
        in_dtype = "npy_" + node.inputs[0].dtype
        out_dtype = "npy_" + node.outputs[0].dtype
        k_var = "kernel_reduce_010_AD_%(name)s" % locals()
        err_check = """
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "gpuarray error: %(k_var)s: %%s.",
                             GpuKernel_error(&%(k_var)s, err));
                %(fail)s;
            }
        """ % locals()
        print("""
        {
            //int n_summations = PyGpuArray_DIMS(%(x)s)[0] * PyGpuArray_DIMS(%(x)s)[2];

            //if ((n_summations >= 15 * 32) && (PyGpuArray_DIMS(%(x)s)[2]>=16))
            if (1) // if the alternative is less buggy, consider not using this branch
            {
                // If there are a lot of summations to do, then we can use simple parallelization -
                // use each thread to do one sum.

                // we might as well launch blocks of 32 threads because that's the warp size.
                // we could schedule more threads if we were maxing out the gridsize below, but
                // the gridsize is way more than the physical hardware and I think 32 threads
                // on a huge grid is enough to fully use the hardware.
                size_t n_threads[3] = {32, 1, 1};

                // We kindof reshape the input implicitly to something 4D:
                //  the shape A,B,C    ->   A, B, D, E
                //  where C <= D*E < C+32
                //  where E==32

                size_t A = PyGpuArray_DIMS(%(x)s)[0];
                size_t B = PyGpuArray_DIMS(%(x)s)[1];
                size_t C = PyGpuArray_DIMS(%(x)s)[2];
                size_t D = C/32;
                if (32*D < C) D+= 1;
                assert ((C <= 32*D) && (32*D < C+32));

                // The gridsize would ideally be (A, D).  But we do the following logic to make
                // sure we don't ask for a grid that is too big.
                size_t n_blocks[3] = {A, D, 1};
                if (n_blocks[0] > 4096) n_blocks[0] = 4096;
                if (n_blocks[0]*n_blocks[1] > 4096) n_blocks[1] = 4096/n_blocks[0];
                ssize_t stride_A0 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
                ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
                ssize_t stride_A2 = PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s);
                ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
                ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1]/sizeof(%(out_dtype)s);
                void *kernel_params[] = {
                        (void *)&A, (void *)&B, (void *)&C, (void *)&D,
                        (void *)%(x)s->ga.data,
                        (void *)&%(x)s->ga.offset,
                        (void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2,
                        (void *)%(z)s->ga.data,
                        (void *)&%(z)s->ga.offset,
                        (void *)&stride_Z0, (void *)&stride_Z1};
                int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, n_threads, 0, kernel_params);
                %(err_check)s
            }
            else
            {
                int verbose = %(verbose)s;

                size_t n_threads[3] = {std::min((size_t) 32, PyGpuArray_DIMS(%(x)s)[2]), 1, 1};
                while(    (n_threads[0]*(n_threads[1]+1)<=256)
                       && (n_threads[1]<PyGpuArray_DIMS(%(x)s)[1])){
                    n_threads[1]++;
                }

                size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)4096), 1, 1};
                n_blocks[1] = std::min(
                    ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],
                                (size_t)n_threads[0]),
                    (size_t)(4096 / n_blocks[0])
                    );
                if(std::min(std::min(PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s),
                                     PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s)),
                            PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s))
                   == PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s)
                   && n_blocks[1] == ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],
                                                 (size_t)n_threads[0])){
                    if(verbose>1)
                        printf("n_block.x.1=%%d, n_block.x.2=%%d, n_block.y.1=%%d, n_block.y.2=%%d,\\n",
                               PyGpuArray_DIMS(%(x)s)[0],4096,
                               ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],(size_t)n_threads[0]),
                               (size_t)(4096 / n_blocks[0]));
                    assert(n_threads[0]<=32);
                    %(makecall_inner)s
                }else{
                    n_threads[0] = std::min(PyGpuArray_DIMS(%(x)s)[1],
                                            (size_t) 256);
                    n_blocks[0] = std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)4096);
                    n_blocks[1] = std::min(
                        PyGpuArray_DIMS(%(x)s)[2],
                        (size_t)(4096 / n_blocks[0])
                        );
                    %(makecall)s
                }
            }
        }
        """ % locals(), file=sio)

    def c_code_reduce_0101(self, sio, node, name, x, z, fail):
        verbose = self.verbose
        makecall = self._makecall(node, name, x, z, fail)
        print("""
        {
            int verbose = %(verbose)s;
            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3], (size_t) 256), 1, 1};
            while (n_threads[0] * n_threads[1] <= 256)
            {
                if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[1]) break;
                n_threads[1] += 1;
            }
            n_threads[1] -= 1;
            size_t n_blocks[3] = {PyGpuArray_DIMS(%(x)s)[0], PyGpuArray_DIMS(%(x)s)[2], 1};
            %(makecall)s
        }
        """ % locals(), file=sio)

    def c_code_reduce_100(self, sio, node, name, x, z, fail):
        verbose = self.verbose
        makecall = self._makecall(node, name, x, z, fail)
        in_dtype = "npy_" + node.inputs[0].dtype
        out_dtype = "npy_" + node.outputs[0].dtype
        acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
        k_var = "kernel_reduce_010_AD_%(name)s" % locals()
        err_check = """
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "gpuarray error: %(k_var)s: %%s.",
                             GpuKernel_error(&%(k_var)s, err));
                %(fail)s;
            }
        """ % locals()
        # use threadIdx.x for i0
        # use blockIdx.x for i1
        # use blockIdx.y for i2
        print("""
        {
            int verbose = %(verbose)s;
            if (PyGpuArray_STRIDES(%(x)s)[2] != sizeof(%(in_dtype)s)){
                size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1};
                size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)4096), 1, 1};
                while (n_blocks[0] * (n_blocks[1]+1) <= 4096 &&
                       n_blocks[1] <= PyGpuArray_DIMS(%(x)s)[2])
                {
                    n_blocks[1] += 1;
                }
                %(makecall)s
            }
            else
            {   // reuse 010_AD kernel, we transpose the 2 first dim
                // See the reduction for the real 010_AD kernel for
                // explanation. We do this to get coalesced read.
                size_t n_threads[3] = {32, 1, 1};

                size_t A = PyGpuArray_DIMS(%(x)s)[1];
                size_t B = PyGpuArray_DIMS(%(x)s)[0];
                size_t C = PyGpuArray_DIMS(%(x)s)[2];
                size_t D = C/32;
                if (32*D < C) D+= 1;
                assert ((C <= 32*D) && (32*D < C+32));

                // The gridsize would ideally be (A, D).  But we do the following logic to make
                // sure we don't ask for a grid that is too big.
                size_t n_blocks[3] = {A, D, 1};
                if (n_blocks[0] > 4096) n_blocks[0] = 4096;
                if (n_blocks[0]*n_blocks[1] > 4096) n_blocks[1] = 4096/n_blocks[0];
                size_t n_shared = 0;
                ssize_t stride_A0 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
                ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
                ssize_t stride_A2 = PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s);
                ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
                ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1]/sizeof(%(out_dtype)s);
                void *kernel_params[] = {
                        (void *)&A, (void *)&B, (void *)&C, (void *)&D,
                        (void *)%(x)s->ga.data,
                        (void *)&%(x)s->ga.offset,
                        (void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2,
                        (void *)%(z)s->ga.data,
                        (void *)&%(z)s->ga.offset,
                        (void *)&stride_Z0, (void *)&stride_Z1};
                int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, n_threads, 0, kernel_params);
                %(err_check)s
            }
        }
        """ % locals(), file=sio)

    def c_code_reduce_110(self, sio, node, name, x, z, fail):
        verbose = self.verbose
        makecall = self._makecall(node, name, x, z, fail)
        print("""
        {
            int verbose = %(verbose)s;
            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 256), 1, 1};
            while (n_threads[0]*n_threads[1] <= 256)
            {
                if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[0])
                    break;
                n_threads[1] += 1;
            }
            n_threads[1] -= 1;

            size_t n_blocks[3] = {PyGpuArray_DIMS(%(x)s)[2], 1, 1};
            %(makecall)s
        }
        """ % locals(), file=sio)

    def c_code_reduce_001(self, sio, node, name, x, z, fail):
        verbose = self.verbose
        makecall = self._makecall(node, name, x, z, fail)
        print("""
        {
            int verbose = %(verbose)s;
            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1};
            size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 4096), 1, 1};
            while (n_blocks[0] * n_blocks[1] <= 4096)
            {
                if (n_blocks[1] > PyGpuArray_DIMS(%(x)s)[1])
                    break;
                n_blocks[1] += 1;
            }
            n_blocks[1] -= 1;
            %(makecall)s
        }
        """ % locals(), file=sio)

    def c_code_reduce_101(self, sio, node, name, x, z, fail):
        verbose = self.verbose
        makecall = self._makecall(node, name, x, z, fail,
                                  extra_dims=[("size_t one = 1;", "(void *) &one")],
                                  extra_strides=[("ssize_t sone = 1;", "(void *) &sone")],
                                  pattern="1011")
        print("""
        {
            int verbose = %(verbose)s;
//            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3],
//                                            (size_t) 256), 1, 1};
            size_t n_threads[3] = {1, 1, 1};

            while (n_threads[0] * (n_threads[1]+1) <= 256) ++n_threads[1];
            if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[2])
                n_threads[1] = PyGpuArray_DIMS(%(x)s)[2];

            while (n_threads[0] * n_threads[1] * (n_threads[2]+1) <= 256)
                ++n_threads[2];
            if (n_threads[2] > 64)
                n_threads[2] = 64;
            if (n_threads[2] > PyGpuArray_DIMS(%(x)s)[0])
                n_threads[2] = PyGpuArray_DIMS(%(x)s)[0];

            size_t n_blocks[3] = {PyGpuArray_DIMS(%(x)s)[1], 1, 1};
            %(makecall)s
        }
        """ % locals(), file=sio)

    def c_code_reduce_111(self, sio, node, name, x, z, fail):
        verbose = self.verbose
        makecall = self._makecall(node, name, x, z, fail)
        print("""
        {
            int verbose = %(verbose)s;
            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1};

            //get as many y threads as we can fit
            while (n_threads[0] * n_threads[1] <= 256)
            {
                if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[1])
                    break;
                n_threads[1] += 1;
            }
            n_threads[1] -= 1;

            //get as many z threads as we can fit
            while (n_threads[0] * n_threads[1] * n_threads[2] <= 256)
            {
                if (n_threads[2] > PyGpuArray_DIMS(%(x)s)[0])
                    break;
                n_threads[2] += 1;
            }
            n_threads[2] -= 1;
            //Maximum for Fermi GPU on that dimensions.
            n_threads[2] = std::min(n_threads[2], (size_t)64);

            size_t n_blocks[3] = {1, 1, 1};
            %(makecall)s
        }
        """ % locals(), file=sio)

    def c_code_reduce_0011(self, sio, node, name, x, z, fail):
        verbose = self.verbose
        makecall = self._makecall(node, name, x, z, fail)
        in_dtype = "npy_" + node.inputs[0].dtype
        out_dtype = "npy_" + node.outputs[0].dtype
        acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
        print("""
        {
            int verbose = %(verbose)s;

            size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 4096), 1, 1};

            while (n_blocks[0] * n_blocks[1] <= 4096 &&
                   n_blocks[1] < PyGpuArray_DIMS(%(x)s)[1])
            {
                n_blocks[1] += 1;
            }

            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3], (size_t) 256), 1, 1};
            while (n_threads[0] * n_threads[1] <= 256
                   && n_threads[1] < PyGpuArray_DIMS(%(x)s)[2]
                   && n_threads[0] * n_threads[1] * sizeof(%(acc_dtype)s) <= (15 * 1024 - 200))
            {
                n_threads[1] += 1;
            }

            %(makecall)s
        }
        """ % locals(), file=sio)

    def c_code_reduce_1111(self, sio, node, name, x, z, fail):
        verbose = self.verbose
        makecall = self._makecall(node, name, x, z, fail)
        print("""
        {
            int verbose = %(verbose)s;
            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1};

            //get as many y threads as we can fit
            while (n_threads[0] * n_threads[1] <= 256)
            {
                if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[1])
                    break;
                n_threads[1] += 1;
            }
            n_threads[1] -= 1;

            //get as many z threads as we can fit
            while (n_threads[0] * n_threads[1] * n_threads[2] <= 256)
            {
                if (n_threads[2] > PyGpuArray_DIMS(%(x)s)[0])
                    break;
                n_threads[2] += 1;
            }
            n_threads[2] -= 1;

            //Maximum for Fermi GPU on that dimensions.
            n_threads[2] = std::min(n_threads[2], (size_t)64);

            size_t n_blocks[3] = {1, 1, 1};
            %(makecall)s
        }
        """ % locals(), file=sio)

    def c_code_reduce_1011(self, sio, node, name, x, z, fail):
        verbose = self.verbose
        makecall = self._makecall(node, name, x, z, fail)
        print("""
        {
            int verbose = %(verbose)s;
            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3], (size_t) 256), 1, 1};

            while (n_threads[0] * (n_threads[1]+1) <= 256) ++n_threads[1];
            if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[2])
                n_threads[1] = PyGpuArray_DIMS(%(x)s)[2];

            while (n_threads[0] * n_threads[1] * (n_threads[2]+1) <= 256) ++n_threads[2];
            if (n_threads[2] > 64)
                n_threads[2] = 64;
            if (n_threads[2] > PyGpuArray_DIMS(%(x)s)[0])
                n_threads[2] = PyGpuArray_DIMS(%(x)s)[0];

            size_t n_blocks[3] = {PyGpuArray_DIMS(%(x)s)[1], 1, 1};
            %(makecall)s
        }
        """ % locals(), file=sio)

    def c_code_cache_version_apply(self, node):
        version = [24, self.verbose]  # the version corresponding to the c code in this Op

        # now we insert versions for the ops on which we depend...
        scalar_node = Apply(
            self.scalar_op,
            [Scalar(dtype=input.type.dtype)() for input in node.inputs],
            [Scalar(dtype=output.type.dtype)() for output in node.outputs])
        version.extend(self.scalar_op.c_code_cache_version_apply(scalar_node))
        for i in node.inputs + node.outputs:
            version.extend(Scalar(dtype=i.type.dtype).c_code_cache_version())
        version.extend(self.kernel_version(node))
        if all(version):
            return tuple(version)
        else:
            return ()

    def gpu_kernels(self, node, nodename):
        nd_in = len(self.reduce_mask)
        in_dtype = node.inputs[0].dtype
        out_dtype = node.outputs[0].dtype
        acc_dtype = self._acc_dtype(node.inputs[0].dtype)
        assign_dtype = in_dtype
        flags = Kernel.get_flags(in_dtype, acc_dtype, out_dtype)
        in_type = gpuarray.dtype_to_ctype(in_dtype)
        out_type = gpuarray.dtype_to_ctype(out_dtype)
        acc_type = gpuarray.dtype_to_ctype(acc_dtype)
        load_in = load_w(in_dtype)
        write_out = write_w(out_dtype)
        kernels = []

        if all(i == 1 for i in self.reduce_mask):
            # this kernel is ok for up to a few thousand elements, but
            # it only runs on ONE multiprocessor
            reducebuf = self._k_reduce_buf('Z[0]', node, nodename, sub={})
            reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                             load_in + "(A[i0])",
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[0])", assign_dtype)
            kname = "kernel_reduce_ccontig"
            k_var = "kernel_reduce_ccontig_" + nodename
            sio = StringIO()
            print("""#include "cluda.h"

            KERNEL void %(kname)s(
                    const ga_size d0,
                    const %(in_type)s *A, const ga_size offset_A,
                    %(out_type)s *Z, const ga_size offset_Z)
            {
                const int threadCount = blockDim.x;
                const int threadNum = threadIdx.x;
                extern __shared__ %(acc_type)s buf[];
                A = (const %(in_type)s *)(((char *)A)+offset_A);
                Z = (%(out_type)s *)(((char *)Z)+offset_Z);
                %(acc_type)s myresult = %(reduce_init)s;

                for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
                {
                    %(reduce_fct)s
                }
                %(reducebuf)s
            }
            """ % locals(), file=sio)
            params = [
                'uintp',
                gpuarray.GpuArray, 'uintp',
                gpuarray.GpuArray, 'uintp'
                ]
            kernels.append(Kernel(code=sio.getvalue(), name=kname,
                                  params=params, flags=flags,
                                  objvar=k_var))
        if self.reduce_mask == (1,):
            # this kernel is ok for up to a few thousand elements, but
            # it only runs on ONE multiprocessor
            reducebuf = self._k_reduce_buf('Z[0]', node, nodename, sub={})
            reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                             load_in + "(A[i0 * sA0])",
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[0])", assign_dtype)
            kname = "kernel_reduce_1"
            k_var = "kernel_reduce_1_" + nodename
            sio = StringIO()
            print("""#include "cluda.h"

            KERNEL void %(kname)s(
                    const ga_size d0,
                    const %(in_type)s *A, const ga_size offset_A,
                    const ga_ssize sA0,
                    %(out_type)s * Z, const ga_size offset_Z)
            {
                const int threadCount = blockDim.x;
                const int threadNum = threadIdx.x;
                extern __shared__ %(acc_type)s buf[];
                A = (const %(in_type)s *)(((char *)A)+offset_A);
                Z = (%(out_type)s *)(((char *)Z)+offset_Z);
                %(acc_type)s myresult = %(reduce_init)s;

                for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
                {
                    %(reduce_fct)s
                }
                %(reducebuf)s
            }
            """ % locals(), file=sio)
            params = [
                'uintp',
                gpuarray.GpuArray, 'uintp',
                'intp',
                gpuarray.GpuArray, 'uintp'
                ]
            kernels.append(Kernel(code=sio.getvalue(), name=kname,
                                  params=params, flags=flags, objvar=k_var))
        if self.reduce_mask == (1, 1):
            # this kernel is ok for up to a few thousand elements, but
            # it only runs on ONE multiprocessor
            reducebuf = self._k_reduce_buf('Z[0]', node, nodename, sub={})
            reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                             load_in + "(A[i0 * sA0 + i1 * sA1])",
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[0])", assign_dtype)
            kname = "kernel_reduce_11"
            k_var = "kernel_reduce_11_" + nodename
            sio = StringIO()
            print("""#include "cluda.h"

            KERNEL void %(kname)s(
                    const ga_size d0, const ga_size d1,
                    const %(in_type)s *A, const ga_size offset_A,
                    const ga_ssize sA0, const ga_ssize sA1,
                    %(out_type)s * Z, const ga_size offset_Z)
            {
                const int threadCount = blockDim.x * blockDim.y;
                const int threadNum = threadIdx.y*blockDim.x + threadIdx.x;
                extern __shared__ %(acc_type)s buf[];
                A = (const %(in_type)s *)(((char *)A)+offset_A);
                Z = (%(out_type)s *)(((char *)Z)+offset_Z);
                %(acc_type)s myresult = %(reduce_init)s;

                for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y)
                {
                    for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
                    {
                        %(reduce_fct)s;
                    }
                }
                %(reducebuf)s
            }
            """ % locals(), file=sio)
            params = [
                'uintp', 'uintp',
                gpuarray.GpuArray, 'uintp',
                'intp', 'intp',
                gpuarray.GpuArray, 'uintp'
                ]
            kernels.append(Kernel(code=sio.getvalue(), name=kname,
                                  params=params, flags=flags, objvar=k_var))
        # 01, 011, 0111
        if (0 == self.reduce_mask[0] and
                all(self.reduce_mask[1:]) and
                nd_in in [2, 3, 4]):
            # this kernel uses one block for each row.
            # threads per block for each element per row.

            N_pattern = ''.join(['1'] * (nd_in - 1))
            # TODO: is it faster to hardcode sA3, etc. in the later
            # code, rather than have the for_* variables declare them
            # and the later code use their names?
            if nd_in == 2:
                for_i1 = "for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)"
                first_i1 = 'threadIdx.x'
                sA1 = 'sA1'
                for_i2 = "int i2=0, sA2=0;"
                sA2 = '0'
                first_i2 = '0'
                for_i3 = "int i3=0, sA3=0;"
                sA3 = '0'
                first_i3 = '0'
            if nd_in == 3:
                for_i1 = "for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)"
                first_i1 = 'threadIdx.y'
                sA1 = 'sA1'
                for_i2 = "for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)"
                first_i2 = 'threadIdx.x'
                sA2 = 'sA2'
                for_i3 = "int i3=0, sA3=0;"
                first_i3 = 0
                sA3 = '0'
            if nd_in == 4:
                for_i1 = "for (int i1 = threadIdx.z; i1 < d1; i1 += blockDim.z)"
                first_i1 = 'threadIdx.z'
                sA1 = 'sA1'
                for_i2 = "for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)"
                first_i2 = 'threadIdx.y'
                sA2 = 'sA2'
                for_i3 = "for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)"
                first_i3 = 'threadIdx.x'
                sA3 = 'sA3'

            reducebuf = self._k_reduce_buf('Z[i0 * sZ0]', node,
                                           nodename, sub={})
            param_dim = ",".join(["const ga_size d%d" % i
                                  for i in xrange(nd_in)])
            param_strides = ",".join(["const ga_ssize sA%d" % i
                                      for i in xrange(nd_in)])
            decl, kname, params, k_var = self._k_decl(node, nodename)
            init = self._k_init(node, nodename)
            reduce_init = self._assign_init(load_in + "(A[%(first_i3)s * %(sA3)s + %(first_i2)s * %(sA2)s + %(first_i1)s * %(sA1)s + i0 * sA0])" % locals(), assign_dtype)
            reduce_fct = self._assign_reduce(
                node, nodename, "myresult",
                load_in + "(A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0])",
                {}, True)
            sio = StringIO()
            print("""#include "cluda.h"

            %(decl)s{
                %(init)s
                for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
                    myresult = %(reduce_init)s;
                    %(for_i1)s{
                        %(for_i2)s{
                            %(for_i3)s{
                                %(reduce_fct)s;
                            }
                        }
                    }
                    %(reducebuf)s
                }
            }
            """ % locals(), file=sio)
            kernels.append(Kernel(code=sio.getvalue(), name=kname,
                                  params=params, flags=flags, objvar=k_var))
        if self.reduce_mask == (0, 1, 0) or self.reduce_mask == (1, 0):
            # this kernel uses one block for each column,
            # threads per block for each element per column.

            # TODO: This kernel is pretty inefficient in terms of reading, because if A is
            #       c_contiguous (typical case) then each warp is accessing non-contiguous
            #       memory (a segment of a column).
            reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2*sZ1]',
                                           node, nodename, sub={})
            reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                             load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + threadIdx.x * sA1 + i2 * sA2])", assign_dtype)
            kname = "kernel_reduce_010"
            k_var = "kernel_reduce_010_" + nodename
            sio = StringIO()
            print("""#include "cluda.h"

            KERNEL void %(kname)s(
                    const ga_size d0, const ga_size d1, const ga_size d2,
                    const %(in_type)s *A, const ga_size offset_A,
                    const ga_ssize sA0, const ga_ssize sA1, const ga_ssize sA2,
                    %(out_type)s * Z, const ga_size offset_Z,
                    const ga_ssize sZ0, const ga_ssize sZ1)
            {
                const int threadCount = blockDim.x;
                const int threadNum = threadIdx.x;
                extern __shared__ %(acc_type)s buf[];
                A = (const %(in_type)s *)(((char *)A)+offset_A);
                Z = (%(out_type)s *)(((char *)Z)+offset_Z);

                for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
                {
                    for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
                    {
                        %(acc_type)s myresult = %(reduce_init)s;
                        for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
                        {
                            %(reduce_fct)s;
                        }
                        %(reducebuf)s
                    }
                }

            }
            """ % locals(), file=sio)
            params = [
                'uintp', 'uintp', 'uintp',
                gpuarray.GpuArray, 'uintp',
                'intp', 'intp', 'intp',
                gpuarray.GpuArray, 'uintp',
                'intp', 'intp'
                ]
            kernels.append(Kernel(code=sio.getvalue(), name=kname,
                                  params=params, flags=flags, objvar=k_var))
        if self.reduce_mask in [(0, 1, 0), (1, 0), (1, 0, 0)]:
            reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                             load_in + "(X[a * sX0 + b * sX1 + c * sX2])",
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(X[a * sX0 + 0 * sX1 + c * sX2])", assign_dtype)
            kname = "kernel_reduce_010_AD"
            k_var = "kernel_reduce_010_AD_" + nodename
            sio = StringIO()
            print("""#include "cluda.h"

            KERNEL void %(kname)s(
                    const ga_size A, const ga_size B, const ga_size C, const ga_size D,
                    const %(in_type)s *X, const ga_size offset_X,
                    const ga_ssize sX0, const ga_ssize sX1, const ga_ssize sX2,
                    %(out_type)s * Z, const ga_size offset_Z,
                    const ga_ssize sZ0, const ga_ssize sZ1)
            {
                const int threadCount = blockDim.x;
                const int threadNum = threadIdx.x;
                X = (const %(in_type)s *)(((char *)X)+offset_X);
                Z = (%(out_type)s *)(((char *)Z)+offset_Z);
                %(acc_type)s myresult = 0;

                for (int a = blockIdx.x; a < A; a += gridDim.x)
                {
                    for (int i2_D = blockIdx.y; i2_D < D; i2_D += gridDim.y)
                    {
                        int c = i2_D * 32 + threadIdx.x;
                        if (c < C)
                        {
                            myresult = %(reduce_init)s;
                            for (int b = 0; b < B; ++b)
                            {
                                %(reduce_fct)s;
                            }
                            Z[a * sZ0 + c * sZ1] = %(write_out)s(myresult);
                        }
                    }
                }

            }
            """ % locals(), file=sio)
            params = [
                'uintp', 'uintp', 'uintp', 'uintp',
                gpuarray.GpuArray, 'uintp',
                'intp', 'intp', 'intp',
                gpuarray.GpuArray, 'uintp',
                'intp', 'intp'
                ]
            kernels.append(Kernel(code=sio.getvalue(), name=kname,
                                  params=params, flags=flags, objvar=k_var))
        if self.reduce_mask == (0, 1, 0):
            #
            # This kernel is optimized when the inner most dimensions
            # have the smallest stride.

            # this kernel uses one block for multiple columns (up to 32, TODO),
            # threads per block for each element per column.

            # thread.x = dim 2 contiguous
            # thread.y = dim 1
            # block.x = dim 0
            # block.y = dim 1 rest
            init = self._k_init(node, nodename)
            decl, kname, params, k_var = self._k_decl(node, nodename, pattern="010_inner")
            reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]',
                                                    node, nodename,
                                                    'blockDim.x')
            reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                             load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + 0 * sA1 + i2 * sA2])", assign_dtype)
            sio = StringIO()
            print("""#include "cluda.h"

            %(decl)s
            {
                %(init)s
                for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
                {
                    for (int i2 = blockIdx.y*blockDim.x+threadIdx.x; i2 < d2; i2 += gridDim.y*blockDim.x)
                    {
                        myresult = %(reduce_init)s;
                        for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)
                        {
                            %(reduce_fct)s;
                        }
                        %(reducebuf)s
                    }
                }
            }
            """ % locals(), file=sio)
            kernels.append(Kernel(code=sio.getvalue(), name=kname,
                                  params=params, flags=flags, objvar=k_var))
        if self.reduce_mask == (1, 1, 0):
            # this kernel uses one block for each column,
            # threads per block for each element per column.

            # TODO: This kernel is pretty inefficient in terms of reading, because if A is
            #       c_contiguous (typical case) then each warp is accessing non-contiguous
            #       memory (a segment of a column).
            reducebuf = self._k_reduce_buf('Z[blockIdx.x * sZ0]', node, nodename, sub={})
            reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                             load_in + "(A[i0 * sA0 + i1 * sA1 + blockIdx.x * sA2])",
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA2])", assign_dtype)
            kname = "kernel_reduce_110"
            k_var = "kernel_reduce_110_" + nodename
            sio = StringIO()
            print("""#include "cluda.h"

            KERNEL void %(kname)s(
                    const ga_size d0, const ga_size d1, const ga_size d2,
                    const %(in_type)s *A, const ga_size offset_A,
                    const ga_ssize sA0, const ga_ssize sA1, const ga_ssize sA2,
                    %(out_type)s * Z, const ga_size offset_Z,
                    const ga_ssize sZ0)
            {
                const int threadCount = blockDim.x * blockDim.y;
                const int threadNum = threadIdx.y * blockDim.x + threadIdx.x;
                extern __shared__ %(acc_type)s buf[];
                A = (const %(in_type)s *)(((char *)A)+offset_A);
                Z = (%(out_type)s *)(((char *)Z)+offset_Z);
                %(acc_type)s myresult = %(reduce_init)s;

                for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y)
                {
                    for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
                    {
                        %(reduce_fct)s;
                    }
                }

                %(reducebuf)s
            }
            """ % locals(), file=sio)
            params = [
                'uintp', 'uintp', 'uintp',
                gpuarray.GpuArray, 'uintp',
                'intp', 'intp', 'intp',
                gpuarray.GpuArray, 'uintp',
                'intp'
                ]
            kernels.append(Kernel(code=sio.getvalue(), name=kname,
                                  params=params, flags=flags, objvar=k_var))
        if self.reduce_mask == (1, 0, 0):
            reducebuf = self._k_reduce_buf('Z[i1 * sZ0 + i2 * sZ1]',
                                           node, nodename, sub={})
            decl, kname, params, k_var = self._k_decl(node, nodename)
            init = self._k_init(node, nodename)
            reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                             load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[i1 * sA1 + i2 * sA2])", assign_dtype)
            sio = StringIO()
            print("""#include "cluda.h"

            %(decl)s
            {
                %(init)s
                for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
                {
                    for (int i1 = blockIdx.x; i1 < d1; i1 += gridDim.x)
                    {
                        myresult = %(reduce_init)s;
                        for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
                        {
                            %(reduce_fct)s
                        }
                        %(reducebuf)s
                    }
                }
            }
            """ % locals(), file=sio)
            kernels.append(Kernel(code=sio.getvalue(), name=kname,
                                  params=params, flags=flags, objvar=k_var))
        if self.reduce_mask == (1, 1, 1):
            reducebuf = self._k_reduce_buf('Z[0]', node,
                                           nodename, sub={})
            decl, kname, params, k_var = self._k_decl(node, nodename)
            init = self._k_init(node, nodename)
            reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                             load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[0])", assign_dtype)
            sio = StringIO()
            print("""#include "cluda.h"

            %(decl)s
            {
                %(init)s
                myresult = %(reduce_init)s;
                for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z)
                {
                    for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)
                    {
                        for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)
                        {
                            %(reduce_fct)s;
                        }
                    }
                }
                %(reducebuf)s
            }
            """ % locals(), file=sio)
            kernels.append(Kernel(code=sio.getvalue(), name=kname,
                                  params=params, flags=flags, objvar=k_var))
        if self.reduce_mask == (0, 0, 1):
            # this kernel uses one block for each row,
            # threads per block for each element per row.
            reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]',
                                           node, nodename, sub={})
            reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                             load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])", assign_dtype)
            kname = "kernel_reduce_001"
            k_var = "kernel_reduce_001_" + nodename
            sio = StringIO()
            print("""#include "cluda.h"
            KERNEL void %(kname)s(
                    const ga_size d0, const ga_size d1, const ga_size d2,
                    const %(in_type)s *A, const ga_size offset_A,
                    const ga_ssize sA0, const ga_ssize sA1, const ga_ssize sA2,
                    %(out_type)s * Z, const ga_size offset_Z,
                    const ga_ssize sZ0, const ga_ssize sZ1)
            {
                const int threadCount = blockDim.x;
                const int threadNum = threadIdx.x;
                extern __shared__ %(acc_type)s buf[];
                A = (const %(in_type)s *)(((char *)A)+offset_A);
                Z = (%(out_type)s *)(((char *)Z)+offset_Z);

                for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
                {
                    for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
                    {
                        %(acc_type)s myresult = %(reduce_init)s;
                        for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)
                        {
                            %(reduce_fct)s;
                        }
                        %(reducebuf)s
                    }
                }
            }
            """ % locals(), file=sio)
            params = [
                'uintp', 'uintp', 'uintp',
                gpuarray.GpuArray, 'uintp',
                'intp', 'intp', 'intp',
                gpuarray.GpuArray, 'uintp',
                'intp', 'intp'
                ]
            kernels.append(Kernel(code=sio.getvalue(), name=kname,
                                  params=params, flags=flags, objvar=k_var))
        if self.reduce_mask == (0, 0, 1, 1):
            # this kernel uses one block for each row,
            # threads per block for each element per row.
            reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]',
                                           node, nodename, sub={})
            decl, kname, params, k_var = self._k_decl(node, nodename)
            init = self._k_init(node, nodename)
            reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                             load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])",
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])", assign_dtype)
            sio = StringIO()
            print("""#include "cluda.h"

            %(decl)s
            {
                %(init)s

                for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
                {
                    for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
                    {
                        %(acc_type)s myresult = %(reduce_init)s;
                        for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
                        {
                            for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
                            {
                                %(reduce_fct)s;
                            }
                        }
                        %(reducebuf)s
                    }
                }
            }
            """ % locals(), file=sio)
            kernels.append(Kernel(code=sio.getvalue(), name=kname,
                                  params=params, flags=flags, objvar=k_var))
        if self.reduce_mask == (0, 1, 0, 1):
            # this kernel uses one block for each row,
            # threads per block for each element per row.
            reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2 * sZ1]',
                                           node, nodename, sub={})
            decl, kname, params, k_var = self._k_decl(node, nodename)
            init = self._k_init(node, nodename)
            reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                             load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])",
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i2 * sA2])", assign_dtype)
            sio = StringIO()
            print("""#include "cluda.h"

            %(decl)s
            {
                %(init)s

                for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
                {
                    for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
                    {
                        %(acc_type)s myresult = %(reduce_init)s;
                        for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)
                        {
                            for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
                            {
                                %(reduce_fct)s;
                            }
                        }
                        %(reducebuf)s
                    }
                }
            }
            """ % locals(), file=sio)
            kernels.append(Kernel(code=sio.getvalue(), name=kname,
                                  params=params, flags=flags, objvar=k_var))
        if self.reduce_mask == (1, 1, 1, 1):
            reducebuf = self._k_reduce_buf('Z[0]', node, nodename,
                                           sub={})
            decl, kname, params, k_var = self._k_decl(node, nodename)
            init = self._k_init(node, nodename)
            reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                             load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])",
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[0])", assign_dtype)
            sio = StringIO()
            print("""#include "cluda.h"

            %(decl)s
            {
                %(init)s
                myresult = %(reduce_init)s;
                for (int i0 = 0; i0 < d0; i0++)
                    for (int i1 = threadIdx.z; i1 < d1; i1 += blockDim.z)
                    {
                        for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
                        {
                            for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
                            {
                                %(reduce_fct)s;
                            }
                        }
                    }
                %(reducebuf)s
            }
            """ % locals(), file=sio)
            kernels.append(Kernel(code=sio.getvalue(), name=kname,
                                  params=params, flags=flags, objvar=k_var))
        if self.reduce_mask == (1, 0, 1, 1) or self.reduce_mask == (1, 0, 1):
            reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]',
                                           node, nodename, sub={})
            reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                             load_in + "(A[i0 * sA0 + blockIdx.x * sA1 + i2 * sA2 + i3 * sA3])",
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA1])", assign_dtype)
            kname = "kernel_reduce_1011"
            k_var = "kernel_reduce_1011_" + nodename
            sio = StringIO()
            print("""#include "cluda.h"

            KERNEL void %(kname)s(
                    const ga_size d0, const ga_size d1, const ga_size d2, const ga_size d3,
                    const %(in_type)s *A, const ga_size offset_A,
                    const ga_ssize sA0, const ga_ssize sA1, const ga_ssize sA2, const ga_ssize sA3,
                    %(out_type)s * Z, const ga_size offset_Z,
                    const ga_ssize sZ0)
            {
                const int threadCount = blockDim.x * blockDim.y * blockDim.z;
                const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
                extern __shared__ %(acc_type)s buf[];
                A = (const %(in_type)s *)(((char *)A)+offset_A);
                Z = (%(out_type)s *)(((char *)Z)+offset_Z);
                %(acc_type)s myresult = %(reduce_init)s;

                for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z)
                {
                    for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
                    {
                        for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
                        {
                            %(reduce_fct)s;
                        }
                    }
                }
                %(reducebuf)s
            }
            """ % locals(), file=sio)
            params = [
                'uintp', 'uintp', 'uintp', 'uintp',
                gpuarray.GpuArray, 'uintp',
                'intp', 'intp', 'intp', 'intp',
                gpuarray.GpuArray, 'uintp',
                'intp'
                ]
            kernels.append(Kernel(code=sio.getvalue(), name=kname,
                                  params=params, flags=flags, objvar=k_var))
        return kernels


class GpuErfinv(Erfinv):
    """
    Inverse error function for GPU.

    """

    def c_headers(self):
        return ['math_functions.h', 'cublas_v2.h']

    def c_code(self, node, name, inp, out, sub):
        x, = inp
        z, = out
        if node.inputs[0].type in complex_types:
            raise NotImplementedError('type not supported', type)
        # NB: the CUDA erfinv function (GPU op) returns NaN if x is not in [-1; 1],
        # while `scipy.special.erfinv` (CPU op) returns an infinite value
        # (-inf if x < -1, +inf if x > 1).  For consistency between the CPU and
        # GPU ops, we wrap the CUDA erfinv in the conditions below so that the
        # GPU op returns the same values as the CPU op.
        return "%(z)s = (%(x)s <= -1) ? erfinv(-1.0): ((%(x)s >= 1) ? erfinv(1.0): erfinv(%(x)s));" % locals()
gpu_erfinv = GpuErfinv(upgrade_to_float_no_complex, name='gpu_erfinv')


class GpuErfcinv(Erfcinv):
    """
    Inverse complementary error function for GPU.

    """

    def c_headers(self):
        return ['math_functions.h', 'cublas_v2.h']

    def c_code(self, node, name, inp, out, sub):
        x, = inp
        z, = out
        if node.inputs[0].type in complex_types:
            raise NotImplementedError('type not supported', type)
        # NB: the CUDA erfcinv function (GPU op) returns NaN if x is not in [0; 2],
        # while `scipy.special.erfcinv` (CPU op) returns an infinite value
        # (+inf if x < 0, -inf if x > 2).  For consistency between the CPU and
        # GPU ops, we wrap the CUDA erfcinv in the conditions below so that the
        # GPU op returns the same values as the CPU op.
        return "%(z)s = (%(x)s <= 0) ? erfcinv(0.0): ((%(x)s >= 2) ? erfcinv(2.0): erfcinv(%(x)s));" % locals()
gpu_erfcinv = GpuErfcinv(upgrade_to_float_no_complex, name='gpu_erfcinv')
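

# The two scalar ops above are intended to replace Erfinv/Erfcinv inside
# GpuElemwise graphs.  The sketch below is illustrative only and is not used
# anywhere in this module; the helper name is hypothetical.
def _example_gpu_erfinv_elemwise():
    # Build, by hand, the kind of GpuElemwise node that the GPU optimizations
    # would normally create for an erfinv computation.
    return GpuElemwise(gpu_erfinv)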


# Caching GpuCAReduceCuda
def gpu_ca_reduce_cuda(scalar_op, axis=None, reduce_mask=None, dtype=None,
                       acc_dtype=None, pre_scalar_op=None):
    key = (scalar_op, axis, reduce_mask, dtype, acc_dtype,
           pre_scalar_op)
    if key not in gpu_ca_reduce_cuda.cache:
        gpu_ca_reduce_cuda.cache[key] = GpuCAReduceCuda(scalar_op, axis,
                                                        reduce_mask, dtype,
                                                        acc_dtype,
                                                        pre_scalar_op)
    return gpu_ca_reduce_cuda.cache[key]
gpu_ca_reduce_cuda.cache = {}
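

# Illustrative sketch (not executed at import time; the helper name is
# hypothetical): repeated calls to the factory above with identical arguments
# are expected to return the same cached GpuCAReduceCuda instance, which keeps
# op comparison and graph merging cheap.
def _example_gpu_ca_reduce_cuda_cache():
    op_a = gpu_ca_reduce_cuda(scalar.add, axis=None)
    op_b = gpu_ca_reduce_cuda(scalar.add, axis=None)
    # Same key, so the very same Op object comes back.
    assert op_a is op_b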


class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
    """
    CAReduce that reuses the Python code from gpuarray.

    """
    def __init__(self, scalar_op, axis=None, dtype=None, acc_dtype=None):
        if not hasattr(scalar_op, 'identity'):
            raise ValueError("No identity on scalar op")
        CAReduceDtype.__init__(self, scalar_op, axis=axis, dtype=dtype,
                               acc_dtype=acc_dtype)

    def __str__(self):
        ax = ''
        if self.axis is not None:
            ax = '{%s}' % (', '.join(str(x) for x in self.axis),)
        return "GpuReduce{%s}%s" % (self.scalar_op, ax)

    def make_node(self, input):
        ctx_name = infer_context_name(input)
        res = CAReduceDtype.make_node(self, input)
        input = as_gpuarray_variable(input, ctx_name)
        otype = GpuArrayType(dtype=res.outputs[0].dtype,
                             broadcastable=res.outputs[0].broadcastable,
                             context_name=ctx_name)

        if res.op.axis is not None:
            redux = []
            for i in range(len(input.type.broadcastable)):
                redux.append(i in res.op.axis)
                # since redux is just another way to describe what is in axis
                # it doesn't need to be compared in __eq__ or __hash__
            res.op.redux = redux

        return Apply(res.op, [input], [otype()])

    def get_params(self, node):
        return node.outputs[0].type.context

    def prepare_node(self, node, storage_map, compute_map, impl):
        # cache the kernel object
        self.get_kernel_cache(node)

    def get_kernel_cache(self, node):
        attr = '@cache_reduction_k'
        if self.axis is None:
            redux = [True] * node.inputs[0].ndim
        else:
            redux = self.redux
        if not hasattr(node, attr):
            acc_dtype = getattr(self, 'acc_dtype', None)
            if acc_dtype is None:
                acc_dtype = node.outputs[0].type.dtype
            if any(redux):
                setattr(node, attr, self.generate_kernel(node, acc_dtype,
                                                         redux))

        if any(redux):
            return getattr(node, attr)

    def gpu_kernels(self, node, name):
        if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
            # Some OpenCL compilers do not accept no-argument empty kernels
            src = "#include \"cluda.h\"\nKERNEL void reduk(GLOBAL_MEM float *a) { a[0] = 0; }"
            params = ['float32']
        else:
            k = self.get_kernel_cache(node)
            _, src, _, _ = k._get_basic_kernel(k.init_local_size,
                                               node.inputs[0].ndim)
            nd = node.inputs[0].ndim
            params = ['uint32', gpuarray.GpuArray, 'uint32']
            params.extend('uint32' for _ in range(nd))
            params.append(gpuarray.GpuArray)
            params.append('uint32')
            params.extend('int32' for _ in range(nd))
        acc_dtype = getattr(self, 'acc_dtype', None)
        if acc_dtype is None:
            acc_dtype = node.outputs[0].type.dtype
        return [Kernel(code=src, name="reduk", params=params,
                       flags=Kernel.get_flags(node.inputs[0].type.dtype,
                                              acc_dtype,
                                              node.outputs[0].type.dtype),
                       objvar='k_reduk_' + name)]

    def c_code(self, node, name, inp, out, sub):
        if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
            # We special case the no-reduction case since the gpu
            # kernel has trouble handling it.
            return """
        Py_XDECREF(%(out)s);
        %(out)s = pygpu_copy(%(inp)s, GA_ANY_ORDER);
        if (!%(out)s) {
            %(fail)s
        }

        """ % dict(out=out[0], inp=inp[0], fail=sub['fail'])
        k = self.get_kernel_cache(node)
        _, src, _, ls = k._get_basic_kernel(k.init_local_size,
                                            node.inputs[0].ndim)
        if self.axis is None:
            redux = [True] * node.inputs[0].ndim
        else:
            redux = self.redux
        acc_dtype = getattr(self, 'acc_dtype', None)
        if acc_dtype is None:
            acc_dtype = node.outputs[0].type.dtype
        input = inp[0]
        output = out[0]
        nd_out = node.outputs[0].ndim
        code = """
        size_t gs = 1;
        size_t ls;
        unsigned int n = 1;
        unsigned int proxy_dim[%(nd_in)s];
        unsigned int proxy_off;
        int proxy_str[%(nd_in)s];
        void *args[%(n_args)s];
        PyGpuArrayObject *tmp;
        int err;
""" % dict(n_args=4 + (node.inputs[0].ndim * 2), nd_in=node.inputs[0].ndim)

        if nd_out != 0:
            code += """
        size_t out_dims[%(nd_out)s];
        int need_out = %(output)s == NULL || %(output)s->ga.nd != %(nd_out)s;
""" % dict(nd_out=nd_out, output=output)
            j = 0
            for i in range(node.inputs[0].ndim):
                if not self.redux[i]:
                    code += """
        out_dims[%(j)s] = %(input)s->ga.dimensions[%(i)s];
        if (!need_out)
            need_out |= %(output)s->ga.dimensions[%(j)s] != out_dims[%(j)s];
""" % dict(j=j, i=i, input=input, output=output)
                    j += 1
            code += """
        if (need_out) {
            %(output)s = pygpu_empty(%(nd_out)s, out_dims, %(out_type)s, GA_C_ORDER, %(ctx)s, Py_None);
            if (!%(output)s) {
                %(fail)s
            }
        }
        """ % dict(output=output, nd_out=nd_out, fail=sub['fail'],
                   ctx=sub['params'],
                   out_type=dtype_to_typecode(node.outputs[0].type.dtype))
        else:
            code += """
        if (%(output)s == NULL || %(output)s->ga.nd != 0) {
            Py_XDECREF(%(output)s);
            %(output)s = pygpu_empty(0, NULL, %(out_type)s, GA_C_ORDER,
                                     %(ctx)s, Py_None);
            if (!%(output)s) {
                %(fail)s
            }
        }
        """ % dict(output=output, fail=sub['fail'], ctx=sub['params'],
                   out_type=dtype_to_typecode(node.outputs[0].type.dtype))

        if acc_dtype != node.outputs[0].type.dtype:
            code += """
        tmp = pygpu_empty(%(output)s->ga.nd, %(output)s->ga.dimensions,
                          %(acc_type)s, GA_C_ORDER, %(ctx)s, Py_None);
        if (!tmp) %(fail)s
        """ % dict(output=output, fail=sub['fail'], ctx=sub['params'],
                   acc_type=dtype_to_typecode(acc_dtype))
        else:
            code += """
        tmp = %(output)s;
        Py_INCREF(tmp);
        """ % dict(output=output)

        # We need the proxies since we are passing a pointer to the
        # data into the call and therefore we need a real copy of the
        # data in the proper type.
        code += """
        args[0] = &n;
        args[1] = tmp->ga.data;
        args[2] = &tmp->ga.offset;
        """ % dict(output=output)

        p = 3
        for i in range(node.inputs[0].ndim):
            code += """
        proxy_dim[%(i)s] = %(input)s->ga.dimensions[%(i)s];
        args[%(p)s] = &proxy_dim[%(i)s];
        n *= %(input)s->ga.dimensions[%(i)s];
        """ % dict(i=i, p=p, input=input)
            p += 1
            if not redux[i]:
                code += "gs *= %(input)s->ga.dimensions[%(i)s];" % dict(input=input, i=i)

        code += """
        args[%(p)s] = %(input)s->ga.data;
        proxy_off = %(input)s->ga.offset;
        args[%(p)s+1] = &proxy_off;
        """ % dict(p=p, input=input)
        p += 2

        for i in range(node.inputs[0].ndim):
            code += """
        proxy_str[%(i)s] = %(input)s->ga.strides[%(i)s];
        args[%(p)s] = &proxy_str[%(i)s];
        """ % dict(p=p, i=i, input=input)
            p += 1

        code += """
        if (gs == 0) gs = 1;
        n /= gs;
        ls = %(ls)s;
        err = GpuKernel_call(&%(k_var)s, 1, &gs, &ls, 0, args);
        if (err != GA_NO_ERROR) {
            PyErr_Format(PyExc_RuntimeError,
                         "gpuarray error: GpuCAReduceCPY: %%s.",
                         GpuKernel_error(&%(k_var)s, err));
            %(fail)s
        }

        if (%(cast_out)d) {
            err = GpuArray_move(&%(output)s->ga, &tmp->ga);
            Py_XDECREF(tmp);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "gpuarray error: GpuCAReduceCPY [cast]: %%s.",
                             GpuArray_error(&tmp->ga, err));
                %(fail)s
            }
        } else {
            Py_XDECREF(%(output)s);
            %(output)s = tmp;
        }

        """ % dict(k_var='k_reduk_' + name,
                   ls=ls, fail=sub['fail'], output=output, input=input,
                   cast_out=bool(acc_dtype != node.outputs[0].type.dtype))

        return code

    def c_code_cache_version_apply(self, node):
        return (4, self.kernel_version(node))

    def generate_kernel(self, node, odtype, redux):
        if isinstance(self.scalar_op, scalar.basic.Add):
            reduce_expr = "a + b"
        elif isinstance(self.scalar_op, scalar.basic.Mul):
            reduce_expr = "a * b"
        else:
            raise NotImplementedError()
        return ReductionKernel(node.inputs[0].type.context, odtype,
                               self.scalar_op.identity, reduce_expr, redux,
                               arguments=[make_argument(node.inputs[0], 'a')],
                               init_nd=node.inputs[0].ndim)

    def perform(self, node, inp, out, ctx):
        input, = inp
        output, = out

        if self.axis is None:
            redux = [True] * input.ndim
        else:
            redux = self.redux

        if any(redux):
            output[0] = self.get_kernel_cache(node)(input).astype(
                copy=False, dtype=node.outputs[0].type.dtype)
        else:
            output[0] = pygpu.gpuarray.array(input, copy=True,
                                             dtype=node.outputs[0].type.dtype,
                                             context=ctx)


# To allow reloading old pickled files
GpuCAReduce = GpuCAReduceCPY
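

# Illustrative sketch (hypothetical helper, not part of the module API):
# building a sum-over-all-axes reduction graph with GpuCAReduceCPY.  This only
# constructs the symbolic node; compiling or running it assumes a usable GPU
# context has been initialized (e.g. with theano.gpuarray.init_dev).
def _example_gpu_ca_reduce_cpy_sum():
    # A 2d float32 GPU variable with no particular context name.
    x = GpuArrayType('float32', (False, False))()
    # scalar.add has an identity (0), as required by GpuCAReduceCPY.__init__.
    return GpuCAReduceCPY(scalar.add)(x)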