from __future__ import absolute_import, print_function, division
import os
import copy
import re
import numpy as np

import theano
from theano import Op, Apply, Type, Variable
from theano import tensor, config
from theano.gradient import grad_undefined
from theano.scalar import (bool as bool_t,
                           int32 as int32_t)
from theano.tensor.basic import (
    Alloc, AllocEmpty, alloc_validate_shape, Join, Split)

from theano.gof import HideC, COp, ParamsType
from theano.gof.utils import MethodNotDefined
from theano.gof.opt import copy_stack_trace

from collections import deque

from six import string_types
from six.moves import xrange
from six import iteritems

try:
    import pygpu
    from pygpu import gpuarray
except ImportError:
    pass

from .type import (GpuArrayType, GpuArrayConstant, gpu_context_type,
                   get_context, ContextNotDefined, EQ_MAP)
from .fp16_help import write_w


def as_gpuarray_variable(x, context_name):
    """
    This will attempt to convert `x` into a variable on the GPU.

    It can take either a value or another variable. If `x` is already
    suitable, it will be returned as-is.

    Parameters
    ----------
    x
        Object to convert
    context_name : str or None
        target context name for the result

    """
    # If this is already some form of variable, try to avoid an extra transfer
    if isinstance(x, Variable):
        while True:
            # If we are already a GpuArrayVariable in the right context
            # then there is nothing to do.
            if (isinstance(x.type, GpuArrayType) and
                    x.type.context_name == context_name):
                return x

            # If x is the result of a transfer, try to dig through.
            # This avoids chaining gpu->host->gpu (or gpu->gpu) transfers
            # by reusing the pre-transfer variable directly.
            if getattr(x, 'owner', None):
                if isinstance(x.owner.op, HostFromGpu):
                    x = x.owner.inputs[0]
                    continue
                if isinstance(x.owner.op, GpuFromHost):
                    x = x.owner.inputs[0]
                    continue
                if isinstance(x.owner.op, GpuToGpu):
                    x = x.owner.inputs[0]
                    continue

            # If none of the conditions were met, then continue with
            # the rest of the body
            break

        # If we couldn't deal with transfers, then maybe it's a tensor
        if isinstance(x.type, tensor.TensorType):
            return copy_stack_trace(x, GpuFromHost(context_name)(x))

    # Try _as_GpuArrayVariable if possible
    if hasattr(x, '_as_GpuArrayVariable'):
        return copy_stack_trace(x, x._as_GpuArrayVariable(context_name))

    # If it didn't work try for a constant
    ctx = get_context(context_name)

    # Move raw GpuArray data to the requested context before wrapping.
    if isinstance(x, gpuarray.GpuArray):
        if x.context.ptr != ctx.ptr:
            x = x.transfer(ctx)

    x = gpuarray.asarray(x, context=ctx)

    bcast = [(s == 1) for s in x.shape]
    return GpuArrayConstant(GpuArrayType(dtype=x.dtype,
                                         broadcastable=bcast,
                                         context_name=context_name),
                            x)


def infer_context_name(*vars):
    """
    Infer the context name to use from the inputs given

    """
    # We try to infer the closest context first
    # TODO: What to do in case of context conflicts?
    # We currently use a first found wins approach
    todo = deque()
    todo.extendleft(vars)
    while todo:
        v = todo.pop()
        if isinstance(v.type, GpuArrayType):
            return v.type.context_name
        if hasattr(v.tag, 'context_name'):
            return v.tag.context_name
        if v.owner:
            if isinstance(v.owner.op, HostFromGpu):
                return v.owner.inputs[0].type.context_name
            # Only walk through single-input nodes to keep the search cheap.
            if len(v.owner.inputs) == 1:
                todo.extendleft(v.owner.inputs)
    # If we can't find a context try None if it exists
    try:
        get_context(None)
        return None
    except ContextNotDefined:
        raise ValueError("Could not infer context from inputs")


def gpuarray_helper_inc_dir():
    # Directory holding the C helper headers shipped next to this module.
    return os.path.join(os.path.dirname(__file__), 'c_code')


class Kernel(object):
    """
    This class groups together all the attributes of a gpu kernel.

    `params` should contain the data type for each argument.  Buffer
    arguments should use the GpuArray class as the data type and
    scalar should use their equivalent numpy dtype.  For ga_size and
    ga_ssize, use gpuarray.SIZE and gpuarray.SSIZE.

    If the `ctypes` flags is set to `True` then it should be a C
    string which represent the typecode to use.

    `flags` can contain the following keys whose values are booleans:

        have_double
            the kernel uses double-typed variables somewhere
        have_small
            the kernel uses variables whose type takes less than 4
            bytes somewhere
        have_complex
            the kernel uses complex values somewhere
        have_half
            the kernel uses half-floats somewhere
        ctypes
            the `params` list consists of C typecodes

    It can also have the key `cflags` which is a string of C flag
    values like this `"GA_USE_DOUBLE|GA_USE_SMALL"`.

    Parameters
    ----------
    code: str
        The source code of the kernel.
    params: list
        list of parameter types.
    name: str
        the name of the kernel function in the source.
    flags: dict
        dictionary of flags
    codevar: str
        the name of the variable for the code object.
        (defaults to `kcode_` + name)
    objvar: str
        the name of the variable for the kernel object.
        (defaults to `k_` + name)
    fname: str
        the name of the function wrapper.
        (defaults to name + `_call`)
    sname: str
        the name of the scheduled call function
        (defaults to name + `_scall`)

    """

    def __init__(self, code, params, name, flags,
                 codevar=None, objvar=None, fname=None, sname=None):
        self.code = code
        self.params = params
        self.name = name
        self.flags = flags
        if codevar is None:
            codevar = 'kcode_' + name
        self.codevar = codevar
        if objvar is None:
            objvar = 'k_' + name
        self.objvar = objvar
        if fname is None:
            fname = name + '_call'
        self.fname = fname
        if sname is None:
            sname = name + '_scall'
        self.sname = sname

    @staticmethod
    def get_flags(*types):
        # Build a `flags` dict (see class docstring) from the dtypes of
        # the given type strings, Types or Variables.
        def get_dtype(t):
            if isinstance(t, string_types):
                return np.dtype(t)
            elif isinstance(t, Type):
                return t.dtype
            elif isinstance(t, Variable):
                return t.type.dtype
            else:
                raise TypeError("can't get a dtype from %s" % (type(t),))
        dtypes = [get_dtype(t) for t in types]
        flags = dict()
        if any(d == np.float64 for d in dtypes):
            flags['have_double'] = True
        if any(d.itemsize < 4 for d in dtypes):
            flags['have_small'] = True
        if any(d.kind == 'c' for d in dtypes):
            flags['have_complex'] = True
        if any(d == np.float16 for d in dtypes):
            flags['have_half'] = True
        return flags

    def _get_c_flags(self):
        # Render the flags dict as a C expression like
        # "GA_USE_DOUBLE|GA_USE_SMALL" ('0' when no flag is set).
        res = []
        if self.flags.get('cflags', '') != '':
            res.append(self.flags['cflags'])
        if self.flags.get('have_double', False):
            res.append('GA_USE_DOUBLE')
        if self.flags.get('have_small', False):
            res.append('GA_USE_SMALL')
        if self.flags.get('have_complex', False):
            res.append('GA_USE_COMPLEX')
        if self.flags.get('have_half', False):
            res.append('GA_USE_HALF')
        res = '|'.join(res)
        if not res:
            return '0'
        return res

    def _get_py_flags(self):
        # Inverse of _get_c_flags: fold any 'cflags' string back into the
        # boolean have_* entries so Python-side code can inspect them.
        res = dict(self.flags)
        cflags = res.pop('cflags', '')
        for fl in cflags.split('|'):
            fl = fl.strip()
            if fl == 'GA_USE_DOUBLE':
                res['have_double'] = True
            if fl == 'GA_USE_SMALL':
                res['have_small'] = True
            if fl == 'GA_USE_COMPLEX':
                res['have_complex'] = True
            if fl == 'GA_USE_HALF':
                res['have_half'] = True
        return res

    def _get_c_types(self):
        # C typecode list for the kernel's parameters, for GpuKernel_init.
        def m(t):
            if t == gpuarray.GpuArray:
                return "GA_BUFFER"
            else:
                return str(gpuarray.dtype_to_typecode(t))
        return ', '.join(m(t) for t in self.params)


def get_ctype(dtype):
    # Map a kernel parameter type to the C type used in the generated
    # wrapper function signature.
    if dtype is gpuarray.GpuArray:
        return "gpudata *"
    elif isinstance(dtype, np.dtype):
        return 'npy_' + dtype.name
    elif dtype == gpuarray.SIZE:
        return "size_t"
    elif dtype == gpuarray.SSIZE:
        return "ssize_t"
    else:
        dtype = np.dtype(dtype)
        return 'npy_' + dtype.name


class GpuKernelBase(object):
    """
    Base class for operations that need to compile kernels.

    It is not mandatory to use this class, but it helps with a lot of
    the small things that you have to pay attention to.

    """
    params_type = gpu_context_type

    def get_params(self, node):
        # Default implementation, suitable for most sub-classes.
        # To be necessarily overridden in a subclass that uses a ParamsType.
        assert (self.params_type is gpu_context_type and
                node.inputs and
                isinstance(node.inputs[0].type, GpuArrayType))
        return node.inputs[0].type.context

    def get_gpu_context(self, node):
        # Private method used to retrieve GPU context, instead of
        # directly using self.get_params(node), as this latter may be overridden.
        if isinstance(self.params_type, ParamsType) and self.params_type.has_type(gpu_context_type):
            # Get field name of gpu_context_type into ParamsType object.
            gpu_context_field = self.params_type.get_field(gpu_context_type)
            # Get Params object (self.get_params() should have been overridden).
            wrap = self.get_params(node)
            # Get GPU context from Params object.
            return getattr(wrap, gpu_context_field)
        assert self.params_type is gpu_context_type
        return self.get_params(node)

    def get_gpu_context_c_name(self, params_c_name):
        # Private method used to retrieve C name of GPU context variable,
        # instead of directly using sub['params'], as params may not be a GPU context
        # (e.g. for sub-classes that use ParamsType).
        if isinstance(self.params_type, ParamsType) and self.params_type.has_type(gpu_context_type):
            return "(%s->%s)" % (params_c_name, self.params_type.get_field(gpu_context_type))
        assert self.params_type is gpu_context_type
        return params_c_name

    def gpu_kernels(self, node, name):
        """
        This is the method to override. This should return an iterable
        of Kernel objects that describe the kernels this op will need.

        """
        raise MethodNotDefined('gpu_kernels')

    def c_headers(self):
        try:
            o = super(GpuKernelBase, self).c_headers()
        except MethodNotDefined:
            o = []
        return o + ['gpuarray/types.h', 'numpy/npy_common.h']

    def c_header_dirs(self):
        try:
            o = super(GpuKernelBase, self).c_header_dirs()
        except MethodNotDefined:
            o = []
        # We rely on the input types for the directory to gpuarray includes
        return o + [np.get_include()]

    def _generate_kernel_code(self, k):
        # Embed the kernel source as a C string literal: newlines become
        # literal '\n' escapes and double quotes are escaped.
        code = '\\n'.join(l for l in k.code.split('\n'))
        code = code.replace('"', '\\"')
        return ("""static const char *%(cname)s_unsigned = "%(code)s";
static const char *%(cname)s = (char *)%(cname)s_unsigned;
""" %
                dict(cname=k.codevar, code=code))

    def _generate_kernel_vars(self, k):
        # One GpuKernel struct-member declaration per kernel.
        return """GpuKernel %(kname)s;""" % dict(kname=k.objvar)

    def _generate_kernel_wrap(self, k):
        # Generate two C wrappers per kernel: <name>_call (explicit
        # grid/block dims) and <name>_scall (1-D, scheduled by
        # GpuKernel_sched).
        args = []
        setargs = []
        for i, p in enumerate(k.params):
            args.append("{} arg{}".format(get_ctype(p), i))
            if p is gpuarray.GpuArray:
                # Buffer arguments are passed by value (gpudata *).
                setarg = "GpuKernel_setarg(&{0}, {1}, arg{1});"
            else:
                # Scalar arguments are passed by address.
                setarg = "GpuKernel_setarg(&{0}, {1}, &arg{1});"
            setargs.append(setarg.format(k.objvar, i))

        args = ', '.join(args)
        setargs = '\n  '.join(setargs)

        return """
int {fname}(unsigned int _nd, size_t *_gdim, size_t *_ldim, size_t _shared,
            {args}) {{
  {setargs}

  return GpuKernel_call(&{kname}, _nd, _gdim, _ldim, _shared, NULL);
}}

int {sname}(unsigned int _nd, size_t *_n, size_t _shared, {args}) {{
  size_t _gs = 0;
  size_t _ls = 0;
  int _err;

  if (_nd != 1) return GA_UNSUPPORTED_ERROR;

  _err = GpuKernel_sched(&{kname}, _n[0], &_gs, &_ls);
  if (_err != GA_NO_ERROR)
    return _err;

  {setargs}

  return GpuKernel_call(&{kname}, 1, &_gs, &_ls, _shared, NULL);
}}
""".format(args=args, fname=k.fname, setargs=setargs, sname=k.sname,
           kname=k.objvar)

    def c_support_code_apply(self, node, name):
        kernels = self.gpu_kernels(node, name)
        codes = '\n'.join(self._generate_kernel_code(k) for k in kernels)
        return codes

    def c_support_code_struct(self, node, name):
        kernels = self.gpu_kernels(node, name)
        kvars = '\n'.join(self._generate_kernel_vars(k) for k in kernels)
        wrappers = '\n'.join(self._generate_kernel_wrap(k) for k in kernels)
        return kvars + '\n' + wrappers

    def _generate_zeros(self, k):
        # Zero the GpuKernel struct so cleanup is safe even if init fails.
        return """memset(&%(v)s, 0, sizeof(%(v)s));""" % dict(v=k.objvar)

    def _generate_kernel_init(self, k, fail, ctx):
        return """{
  int err;
  int types[%(numargs)u] = {%(types)s};
  if ((err = GpuKernel_init(&%(ovar)s, %(ctx)s->ctx, 1,
                            &%(cname)s, NULL, "%(kname)s", %(numargs)u,
                            types, %(flags)s, NULL)) != GA_NO_ERROR) {
    PyErr_Format(PyExc_RuntimeError, "GpuKernel_init error %%d: %%s",
                 err, gpucontext_error(%(ctx)s->ctx, err));
    %(fail)s
  }
}""" % dict(numargs=len(k.params), types=k._get_c_types(),
            ovar=k.objvar, kname=k.name, cname=k.codevar,
            flags=k._get_c_flags(), fail=fail, ctx=ctx)

    def c_init_code_struct(self, node, name, sub):
        ctx = self.get_gpu_context_c_name(sub['params'])
        kernels = self.gpu_kernels(node, name)
        inits_0 = '\n'.join(self._generate_zeros(k) for k in kernels)
        inits = '\n'.join(self._generate_kernel_init(k, sub['fail'], ctx)
                          for k in kernels)
        return '\n'.join([inits_0, inits])

    def _generate_kernel_cleanup(self, k):
        return "GpuKernel_clear(&%(ovar)s);" % dict(ovar=k.objvar)

    def c_cleanup_code_struct(self, node, name):
        kernels = self.gpu_kernels(node, name)
        cleanups = '\n'.join(self._generate_kernel_cleanup(k) for k in kernels)
        return cleanups

    # This is a shorthand for if your op only has a fixed version
    # You can reimplement it, but make sure to call kernel_version()
    def c_code_cache_version_apply(self, node):
        v = self.c_code_cache_version()
        if not v:
            return ()
        return (v, self.kernel_version(node))

    def kernel_version(self, node):
        """
        If you override :meth:`c_code_cache_version_apply`, call this
        method to have the version of the kernel support code.

        Parameters
        ----------
        node : apply node
            The node that we need the cache version for.

        """
        return (9,)


def forward_string_meth(name):
    # Build a method that concatenates the output of GpuKernelBase.<name>
    # with COp.<name> (ignoring the latter when it is not defined), used
    # by CGpuKernelBase to combine both base classes' C-code sections.
    def f(*args):
        res = getattr(GpuKernelBase, name)(*args)
        try:
            res = res + '\n' + getattr(COp, name)(*args)
        except MethodNotDefined:
            pass
        return res
    f.__name__ = name
    return f


def get_dtype(s):
    # Map a type spec string from a '#kernel' line to a kernel param type.
    if s == '*':
        return gpuarray.GpuArray
    if s == 'size':
        return gpuarray.SIZE
    if s == 'ssize':
        return gpuarray.SSIZE
    else:
        return np.dtype(s)


class CGpuKernelBase(COp, GpuKernelBase):
    """
    Class to combine GpuKernelBase and COp.

    It adds a new section type 'kernels' where you can define kernels
    with the '#kernel' tag
    """
    SECTIONS = copy.copy(COp.SECTIONS)
    SECTIONS.add('kernels')

    # Matches '#kernel name:param_types:flags' section headers.
    kernel_re = re.compile(r'^#kernel ([a-zA-Z_].*?)$', re.MULTILINE)

    get_params = GpuKernelBase.get_params
    c_support_code_apply = forward_string_meth('c_support_code_apply')
    c_support_code_struct = forward_string_meth('c_support_code_struct')
    c_init_code_struct = forward_string_meth('c_init_code_struct')
    c_cleanup_code_struct = forward_string_meth('c_cleanup_code_struct')

    def c_code_cache_version_apply(self, node):
        return GpuKernelBase.c_code_cache_version_apply(self, node)

    def _type_macros(self, node):
        # DTYPE_INPUT_n / DTYPE_OUTPUT_n #define/#undef pairs for every
        # GpuArray input and output, injected around each kernel's code.
        define_template = "#define %s %s\n"
        undef_template = "#undef %s\n"
        define_macros = []
        undef_macros = []
        for i, v in enumerate(node.inputs):
            if isinstance(v.type, GpuArrayType):
                macro_name = "DTYPE_INPUT_%d" % (i,)
                macro_value = pygpu.gpuarray.dtype_to_ctype(v.dtype)
                define_macros.append(
                    define_template %
                    (macro_name, macro_value))
                undef_macros.append(undef_template % macro_name)
        for i, v in enumerate(node.outputs):
            if isinstance(v.type, GpuArrayType):
                macro_name = "DTYPE_OUTPUT_%d" % (i,)
                macro_value = pygpu.gpuarray.dtype_to_ctype(v.dtype)
                define_macros.append(
                    define_template %
                    (macro_name, macro_value))
                undef_macros.append(undef_template % macro_name)

        return ''.join(define_macros), ''.join(undef_macros)

    def gpu_kernels(self, node, name):
        if hasattr(self, '_cached_kernels'):
            return self._cached_kernels
        if 'kernels' in self.code_sections:
            code = self.code_sections['kernels']
            # re.split yields [prefix, spec1, code1, spec2, code2, ...]
            split = self.kernel_re.split(code)
            if split[0].strip() != '':
                raise ValueError("Stray code in kernels section before the "
                                 "first #kernel statement.")
            def_macros, undef_macros = self._type_macros(node)
            n = 1
            res = []
            while n < len(split):
                kspec = split[n]
                kcode = split[n + 1]
                splt2 = kspec.split(':')
                if len(splt2) != 3:
                    raise ValueError("Bad kernel spec: %s" % (kspec,))
                kname = splt2[0].strip()
                ktypes = [get_dtype(s.strip()) for s in splt2[1].split(',')]
                kflags = splt2[2].strip()
                kcode = def_macros + '\n' + kcode + '\n' + undef_macros
                res.append(Kernel(kcode, ktypes, kname,
                                  flags=dict(cflags=kflags)))
                n += 2
            self._cached_kernels = res
            return res
        else:
            return GpuKernelBase.gpu_kernels(self, node, name)


class HostFromGpu(Op):
    """
    Transfer data to CPU.

    """
    __props__ = ()
    _f16_ok = True

    def __str__(self):
        return 'HostFromGpu(gpuarray)'

    def make_node(self, x):
        if not isinstance(x.type, GpuArrayType):
            raise TypeError(x)
        out_var = tensor.TensorType(dtype=x.dtype,
                                    broadcastable=x.broadcastable)()
        # Keep the special comparison if there is one.
        values_eq_approx = getattr(x.tag, 'values_eq_approx', None)
        if values_eq_approx:
            out_var.tag.values_eq_approx = EQ_MAP.get(values_eq_approx,
                                                      values_eq_approx)
        return Apply(self, [x], [out_var])

    def perform(self, node, inp, out):
        x, = inp
        z, = out
        z[0] = np.asarray(x)

    def c_code(self, node, name, inputs, outputs, sub):
        return """
        GpuArray %(name)s_ga_s;
        GpuArray *%(name)s_ga = NULL;
        int %(name)serr;
        PyArray_Descr *%(name)s_dtype;
        if (!GpuArray_ISONESEGMENT(&%(inp)s->ga)) {
            if (GpuArray_copy(&%(name)s_ga_s, &%(inp)s->ga, GA_C_ORDER) != GA_NO_ERROR) {
                PyErr_SetString(PyExc_RuntimeError, "Can't make contiguous copy");
                %(fail)s;
            }
            %(name)s_ga = &%(name)s_ga_s;
        } else {
            %(name)s_ga = &%(inp)s->ga;
        }
        %(name)s_dtype = typecode_to_dtype(%(name)s_ga->typecode);
        Py_XDECREF(%(out)s);
        // PyArray_Empty below steals a reference to the dtype we pass it
        // so we need an extra one to spare.
        Py_INCREF(%(name)s_dtype);
        %(out)s = (PyArrayObject *)PyArray_Empty(%(inp)s->ga.nd,
                                (npy_intp *)%(inp)s->ga.dimensions,
                                %(name)s_dtype,
                                (%(inp)s->ga.flags & GA_F_CONTIGUOUS) &&
                                !(%(inp)s->ga.flags & GA_C_CONTIGUOUS));
        if (%(out)s == NULL) {
            if (%(name)s_ga == &%(name)s_ga_s) GpuArray_clear(%(name)s_ga);
            %(fail)s
        }
        Py_BEGIN_ALLOW_THREADS
        %(name)serr = GpuArray_read(PyArray_DATA(%(out)s),
                                    PyArray_NBYTES(%(out)s),
                                    %(name)s_ga);
        Py_END_ALLOW_THREADS
        if (%(name)s_ga == &%(name)s_ga_s) GpuArray_clear(%(name)s_ga);
        if (%(name)serr != GA_NO_ERROR) {
            PyErr_SetString(PyExc_RuntimeError, "Could not read device data.");
            %(fail)s
        }
        """ % {'name': name, 'fail': sub['fail'], 'inp': inputs[0],
               'out': outputs[0]}

    def c_code_cache_version(self):
        return (2,)

    def grad(self, inputs, grads):
        gz, = grads
        # Gradient of a device->host transfer is a host->device transfer.
        return [GpuFromHost(inputs[0].type.context_name)(gz)]

    def R_op(self, inputs, eval_points):
        ev, = eval_points
        return [self(ev)]

    def infer_shape(self, node, xshp):
        return xshp

host_from_gpu = HostFromGpu()


class GpuFromHost(Op):
    """
    Transfer data to GPU.

    """
    __props__ = ('context_name',)
    _f16_ok = True
    params_type = gpu_context_type

    def __init__(self, context_name):
        self.context_name = context_name

    def __str__(self):
        return 'GpuFromHost<%s>' % (self.context_name,)

    def make_node(self, x):
        if not isinstance(x.type, tensor.TensorType):
            raise TypeError(x)
        if "complex" in x.dtype:
            raise TypeError("complex not supported in the new gpuarray back-end.", x)
        out_var = GpuArrayType(broadcastable=x.broadcastable,
                               context_name=self.context_name,
                               dtype=x.dtype)()
        # Keep the special comparison if there is one.
        values_eq_approx = getattr(x.tag, 'values_eq_approx', None)
        if values_eq_approx:
            out_var.tag.values_eq_approx = EQ_MAP.get(values_eq_approx,
                                                      values_eq_approx)
        return Apply(self, [x], [out_var])

    def get_params(self, node):
        return get_context(self.context_name)

    def perform(self, node, inp, out, ctx):
        x, = inp
        z, = out
        z[0] = gpuarray.array(x, context=ctx)

    def grad(self, inputs, grads):
        gz, = grads
        # Gradient of a host->device transfer is a device->host transfer.
        return [as_gpuarray_variable(
            gz, context_name=self.context_name).transfer('cpu')]

    def R_op(self, inputs, eval_points):
        ev, = eval_points
        return [self(ev)]

    def infer_shape(self, node, xshp):
        return xshp

    def c_headers(self):
        return ["gpuarray_helper.h"]

    def c_header_dirs(self):
        return [gpuarray_helper_inc_dir()]

    def c_code(self, node, name, inputs, outputs, sub):
        return """
        PyArrayObject *%(name)s_tmp;
        %(name)s_tmp = PyArray_GETCONTIGUOUS(%(inp)s);
        int err;
        if (%(name)s_tmp == NULL)
            %(fail)s

        if (%(out)s == NULL || !GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga) ||
            !theano_size_check(%(out)s, PyArray_NDIM(%(name)s_tmp),
                               (size_t *)PyArray_DIMS(%(name)s_tmp),
                               get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)))) {
            Py_XDECREF(%(out)s);
            %(out)s = pygpu_empty(PyArray_NDIM(%(name)s_tmp),
                                  (size_t *)PyArray_DIMS(%(name)s_tmp),
                                  get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)),
                                  GA_C_ORDER, %(ctx)s, Py_None);
            if (%(out)s == NULL) {
                Py_DECREF(%(name)s_tmp);
                %(fail)s;
            }
        }

        Py_BEGIN_ALLOW_THREADS
        err = GpuArray_write(&%(out)s->ga, PyArray_DATA(%(name)s_tmp),
                             PyArray_NBYTES(%(name)s_tmp));
        Py_END_ALLOW_THREADS
        Py_DECREF(%(name)s_tmp);
        if (err != GA_NO_ERROR) {
            PyErr_Format(PyExc_RuntimeError, "Could not write data to gpu");
            %(fail)s;
        }
        """ % {'name': name, 'inp': inputs[0], 'ctx': sub['params'],
               'out': outputs[0], 'fail': sub['fail']}
sub['fail']} 742 743 def c_code_cache_version(self): 744 return (10,) 745 746 747class GpuToGpu(Op): 748 """ 749 Transfer data between GPUs. 750 751 """ 752 __props__ = ('context_name',) 753 _f16_ok = True 754 params_type = gpu_context_type 755 756 def __init__(self, context_name): 757 self.context_name = context_name 758 759 def __str__(self): 760 return 'GpuToGpu<%s>' % (self.context_name,) 761 762 def make_node(self, x): 763 if not isinstance(x.type, GpuArrayType): 764 raise TypeError(x) 765 return Apply(self, [x], [GpuArrayType(broadcastable=x.broadcastable, 766 context_name=self.context_name, 767 dtype=x.dtype)()]) 768 769 def get_params(self, node): 770 return get_context(self.context_name) 771 772 def perform(self, node, inp, out, ctx): 773 x, = inp 774 z, = out 775 z[0] = x.transfer(ctx) 776 777 def grad(self, inputs, grads): 778 gz, = grads 779 return [GpuToGpu(inputs[0].type.context_name)(gz)] 780 781 def R_op(self, inputs, eval_points): 782 return self(eval_points[0]) 783 784 def infer_shape(self, node, xshp): 785 return xshp 786 787 def c_code(self, node, name, inputs, outputs, sub): 788 return """ 789 Py_XDECREF(%(out)s); 790 %(out)s = pygpu_empty(%(inp)s->ga.nd, 791 %(inp)s->ga.dimensions, 792 %(inp)s->ga.typecode, 793 GpuArray_IS_C_CONTIGUOUS(&(%(inp)s->ga)) ? GA_C_ORDER:GA_F_ORDER, 794 %(ctx)s, Py_None); 795 if (%(out)s == NULL) { 796 %(fail)s 797 } 798 if (pygpu_transfer(%(out)s, %(inp)s)) { 799 %(fail)s 800 } 801 """ % {'inp': inputs[0], 'ctx': sub['params'], 802 'out': outputs[0], 'fail': sub['fail']} 803 804 def c_code_cache_version(self): 805 return (1,) 806 807 808class GpuAlloc(HideC, Alloc): 809 """ 810 Allocate initialized memory on the GPU. 811 812 Parameters 813 ---------- 814 context_name : str 815 The name of the context in which to allocate memory 816 memset_0 : bool 817 It's only an optimized version. True, it means the 818 value is always 0, so the c code call memset as it is faster. 
819 820 """ 821 822 __props__ = ('memset_0', 'context_name') 823 _f16_ok = True 824 params_type = ParamsType(context=gpu_context_type, memset_0=bool_t) 825 826 def __init__(self, context_name, memset_0=False): 827 self.context_name = context_name 828 self.memset_0 = memset_0 829 830 def get_params(self, node): 831 return self.params_type.get_params(context=get_context(self.context_name), 832 memset_0=self.memset_0) 833 834 def __str__(self): 835 # Hide the memset parameter when not used to prevent confusion. 836 if self.memset_0: 837 m = "{memset_0=True}" 838 else: 839 m = "" 840 return "%s<%s>%s" % (self.__class__.__name__, self.context_name, m) 841 842 def make_node(self, value, *shape): 843 value = as_gpuarray_variable(value, context_name=self.context_name) 844 sh, bcast = alloc_validate_shape(shape) 845 if value.ndim > len(sh): 846 TypeError("The GpuAlloc value to use has more dimensions " 847 "than the specified shape", value.ndim, len(sh)) 848 otype = value.type.clone(broadcastable=bcast) 849 return Apply(self, [value] + sh, [otype()]) 850 851 def c_headers(self): 852 return ['<numpy_compat.h>'] 853 854 def perform(self, node, inputs, outs, params): 855 out, = outs 856 v = inputs[0] 857 sh = tuple(map(int, inputs[1:])) 858 if out[0] is None or out[0].shape != sh: 859 if self.memset_0: 860 out[0] = gpuarray.zeros(sh, dtype=v.dtype, context=params.context) 861 else: 862 out[0] = gpuarray.empty(sh, dtype=v.dtype, context=params.context) 863 out[0][...] = v 864 else: 865 out[0][...] 
= v 866 867 def c_code(self, node, name, inp, out, sub): 868 vv = inp[0] 869 ndim = len(inp[1:]) 870 zz, = out 871 872 code = """ 873 int i; 874 size_t %(name)s_shape[%(ndim)s]; 875 """ % dict(name=name, ndim=ndim) 876 877 for i, shp_i in enumerate(inp[1:]): 878 code += """ 879 %(name)s_shape[%(i)s] = ((dtype_%(shp_i)s *)PyArray_DATA(%(shp_i)s))[0]; 880 """ % dict(name=name, i=i, shp_i=shp_i) 881 882 code += """ 883 int need_new_out = (NULL == %(zz)s || %(zz)s->ga.nd != %(ndim)s); 884 885 if (!need_new_out) 886 for (i = 0; i < %(ndim)s; i++) 887 need_new_out |= %(zz)s->ga.dimensions[i] != %(name)s_shape[i]; 888 889 if (need_new_out && (%(params)s->memset_0)) { 890 //pygpu_zeros can be faster then empty followed by memset. 891 Py_XDECREF(%(zz)s); 892 %(zz)s = pygpu_zeros(%(ndim)s, %(name)s_shape, 893 %(vv)s->ga.typecode, GA_C_ORDER, 894 %(params)s->context, Py_None); 895 if (!%(zz)s) { 896 %(fail)s 897 } 898 } else { 899 if (need_new_out) { 900 Py_XDECREF(%(zz)s); 901 %(zz)s = pygpu_empty(%(ndim)s, %(name)s_shape, 902 %(vv)s->ga.typecode, GA_C_ORDER, 903 %(params)s->context, Py_None); 904 if (!%(zz)s) { 905 %(fail)s 906 } 907 } 908 if (%(params)s->memset_0 && GpuArray_ISONESEGMENT(&%(zz)s->ga)) 909 { 910 int err = GpuArray_memset(&%(zz)s->ga, 0); 911 if (err != GA_NO_ERROR) 912 { 913 PyErr_Format(PyExc_MemoryError, 914 "GpuAlloc: Error memsetting %%llu" 915 " element of device memory to 0.", 916 (unsigned long long)PyGpuArray_SIZE(%(zz)s)); 917 %(fail)s; 918 } 919 } 920 else if (GpuArray_setarray(&%(zz)s->ga, &%(vv)s->ga) != 921 GA_NO_ERROR) { 922 PyErr_SetString(PyExc_ValueError, "setarray failed"); 923 %(fail)s 924 } 925 } 926 """ % dict(name=name, ndim=ndim, zz=zz, vv=vv, params=sub['params'], 927 fail=sub['fail']) 928 929 return code 930 931 def c_code_cache_version(self): 932 return (4,) 933 934 def do_constant_folding(self, node): 935 from . 
import subtensor, blas 936 for client in node.outputs[0].clients: 937 if client[0] == 'output': 938 # If the output is a constant, it will have to be deepcopied 939 # each time the function is called. So we do not fold. 940 return False 941 # The following ops work inplace of their input id 0. 942 elif (client[1] == 0 and 943 # Ops that will work inplace on the Alloc. So if they 944 # get constant_folded, they would copy the 945 # constant and this is less efficients. 946 947 # Not doing the constant folding could also lower 948 # the peak memory usage, as we the "constant" won't 949 # always exists. 950 isinstance(client[0].op, 951 (subtensor.GpuIncSubtensor, 952 subtensor.GpuAdvancedIncSubtensor1, 953 subtensor.GpuAdvancedIncSubtensor1_dev20, 954 subtensor.GpuAdvancedIncSubtensor, 955 blas.GpuGemm, blas.GpuGemv, 956 blas.GpuGer) 957 )): 958 return False 959 # If the clients is a transfer, we don't want to fold. We 960 # let the moving opt finish before deciding what to do. 961 elif isinstance(client[0].op, HostFromGpu): 962 return False 963 return True 964 965 966class GpuAllocEmpty(HideC, AllocEmpty): 967 """ 968 Allocate uninitialized memory on the GPU. 969 970 """ 971 __props__ = ('dtype', 'context_name') 972 _f16_ok = True 973 params_type = ParamsType(context=gpu_context_type, 974 typecode=int32_t) 975 976 def __init__(self, dtype, context_name): 977 self.dtype = dtype 978 self.context_name = context_name 979 980 @property 981 def typecode(self): 982 return gpuarray.dtype_to_typecode(self.dtype) 983 984 def get_params(self, node): 985 return self.params_type.get_params(context=get_context(self.context_name), 986 typecode=self.typecode) 987 988 def make_node(self, *shape): 989 sh, bcast = alloc_validate_shape(shape) 990 output = GpuArrayType(dtype=self.dtype, broadcastable=bcast, 991 context_name=self.context_name)() 992 output.tag.values_eq_approx = tensor.type.values_eq_approx_always_true 993 # The outut can contain nan/inf. 
994 output.type.filter_checks_isfinite = False 995 output.tag.nan_guard_mode_check = False 996 return Apply(self, sh, [output]) 997 998 def debug_perform(self, node, inputs, out_, params): 999 self.perform(node, inputs, out_, params) 1000 out_[0][0][:] = -123456789 1001 1002 def perform(self, node, inputs, out_, params): 1003 out = out_[0] 1004 sh = [int(i) for i in inputs] 1005 if out[0] is None or out[0].shape != sh: 1006 out[0] = pygpu.empty(sh, dtype=self.dtype, context=params.context) 1007 # if out[0] is the right shape, we just return it 1008 1009 def c_headers(self): 1010 return ['<gpuarray_helper.h>'] 1011 1012 def c_header_dirs(self): 1013 return [gpuarray_helper_inc_dir()] 1014 1015 def c_code(self, node, name, inp, out, sub): 1016 ndim = len(inp) 1017 zz = out[0] 1018 fail = sub['fail'] 1019 1020 code = [""" 1021int i; 1022size_t shape[%(ndim)s]; 1023""" % dict(ndim=ndim)] 1024 1025 for i, shp_i in enumerate(inp): 1026 code.append(""" 1027shape[%(i)s] = ((dtype_%(shp_i)s *)PyArray_DATA(%(shp_i)s))[0]; 1028""" % dict(i=i, shp_i=shp_i)) 1029 1030 code.append(""" 1031if (theano_prep_output(&%(zz)s, %(ndim)s, shape, %(params)s->typecode, GA_C_ORDER, 1032 %(params)s->context)) { 1033 %(fail)s 1034} 1035""" % dict(zz=zz, ndim=ndim, fail=fail, params=sub['params'])) 1036 1037 return ''.join(code) 1038 1039 def c_code_cache_version(self): 1040 return (2,) 1041 1042 def do_constant_folding(self, node): 1043 return False 1044 1045 def infer_shape(self, node, input_shapes): 1046 return [node.inputs] 1047 1048 def grad(self, *args): 1049 # Don't reuse the grad implementation from Alloc 1050 raise NotImplementedError("grad disabled") 1051 1052 1053def empty_like(var): 1054 return GpuAllocEmpty(var.type.dtype, var.type.context_name)(*var.shape) 1055 1056 1057class GpuContiguous(Op): 1058 """ 1059 Return a C contiguous version of the input. 1060 1061 This may either pass the object as-is (if already C contiguous) or 1062 make a copy. 
1063 1064 """ 1065 __props__ = () 1066 view_map = {0: [0]} 1067 _f16_ok = True 1068 1069 def grad(self, inputs, dout): 1070 x, = inputs 1071 dout, = dout 1072 dout = as_gpuarray_variable(dout, context_name=infer_context_name(x)) 1073 1074 return [dout] 1075 1076 def make_node(self, input): 1077 input = as_gpuarray_variable(input, 1078 context_name=infer_context_name(input)) 1079 return Apply(self, [input], [input.type()]) 1080 1081 def c_header_dirs(self): 1082 return [gpuarray_helper_inc_dir()] 1083 1084 def c_headers(self): 1085 return ['<gpuarray_helper.h>'] 1086 1087 def c_code_cache_version(self): 1088 return (4,) 1089 1090 def c_code(self, node, name, inp, out, sub): 1091 return """ 1092 { 1093 if (GpuArray_IS_C_CONTIGUOUS(&(%(input)s->ga))) { 1094 Py_XDECREF(%(z)s); 1095 %(z)s = %(input)s; 1096 Py_INCREF(%(z)s); 1097 1098 } else if (NULL == %(z)s 1099 || !theano_size_check(%(z)s, PyGpuArray_NDIM(%(input)s), PyGpuArray_DIMS(%(input)s), 1100 %(input)s->ga.typecode) 1101 || !GpuArray_IS_C_CONTIGUOUS(&(%(z)s->ga))) 1102 { 1103 Py_XDECREF(%(z)s); 1104 %(z)s = pygpu_copy(%(input)s, GA_C_ORDER); 1105 if (!%(z)s) 1106 { 1107 %(fail)s; 1108 } 1109 } else if(pygpu_move(%(z)s, %(input)s) == -1) { 1110 %(fail)s; 1111 } 1112 } 1113 """ % dict(input=inp[0], z=out[0], fail=sub['fail']) 1114 1115 def perform(self, node, inp, out_): 1116 x, = inp 1117 out, = out_ 1118 out[0] = pygpu.ascontiguousarray(x) 1119 1120gpu_contiguous = GpuContiguous() 1121 1122 1123class GpuReshape(HideC, tensor.Reshape): 1124 """ 1125 Reshape for GPU variables. 

    """

    _f16_ok = True

    # __hash__, __eq__, __str__ come from tensor.Reshape
    def make_node(self, x, shp):
        ctx_name = infer_context_name(x)
        x = as_gpuarray_variable(x, context_name=ctx_name)
        shp = tensor.as_tensor_variable(shp)
        # Reuse the CPU Reshape's shape/broadcast inference by building the
        # node on a CPU transfer of x, then recreate the output type on GPU.
        res = x.transfer('cpu').reshape(shp, ndim=self.ndim)
        otype = GpuArrayType(dtype=res.dtype,
                             broadcastable=res.broadcastable,
                             context_name=ctx_name)
        return Apply(self, [x, shp], [otype()])

    def perform(self, node, inp, out_, params):
        x, shp = inp
        out, = out_
        if (len(shp) != self.ndim):
            raise ValueError('shape argument to GpuReshape.perform'
                             ' has incorrect length %i'
                             ', should be %i' % (len(shp), self.ndim), shp)

        if shp.prod() != x.size:
            # We need to do check here to raise the same error as NumPy.
            # We should make pygpu do the same.
            # ss accumulates the product of the non -1 entries; nb_m1 counts
            # the -1 ("infer this dimension") entries.
            ss = 1
            nb_m1 = 0
            for i in shp:
                if i == -1:
                    nb_m1 += 1
                else:
                    ss *= i
            if nb_m1 > 1:
                raise ValueError("Only one -1 is accepted in the new shape")
            elif nb_m1 == 1:
                if (x.size % ss) != 0:
                    raise ValueError("When using -1 in new shape, the computed new shape must be an multiple of the original shape.")
            else:
                raise ValueError("total size of new array must be unchanged")
        out[0] = x.reshape(tuple(shp))

    def c_code_cache_version(self):
        return (3,)

    def c_code(self, node, name, inputs, outputs, sub):
        x, shape = inputs
        output, = outputs
        sdtype = node.inputs[1].type.dtype_specs()[1]
        just_fail = sub['fail']
        # `fail` frees new_dims before failing; use it once new_dims is live.
        fail = """{
            free(new_dims);
            %(just_fail)s
        }""" % dict(just_fail=just_fail)
        params = sub['params']
        return """
        size_t old_size = 1, new_size = 1;
        size_t* new_dims = NULL;
        int compute_axis = -1;

        assert (PyArray_NDIM(%(shape)s) == 1);
        if (PyArray_DIM(%(shape)s, 0) != %(params)s->ndim)
        {
            PyErr_Format(PyExc_ValueError,
                         "GpuReshape: given shape is of incorrect "
                         "length (%%d should be %%d).",
                         PyArray_DIM(%(shape)s, 0), %(params)s->ndim);
            %(just_fail)s;
        }

        new_dims = (size_t*) malloc(sizeof(size_t) * %(params)s->ndim);
        if (new_dims == NULL) {
            PyErr_NoMemory();
            %(just_fail)s
        }

        for (size_t i = 0; i < %(x)s->ga.nd; ++i)
            old_size *= %(x)s->ga.dimensions[i];

        for (size_t i = 0; i < %(params)s->ndim; ++i)
        {
            new_dims[i] = ((%(sdtype)s*)(
                    PyArray_BYTES(%(shape)s) +
                    i * PyArray_STRIDES(%(shape)s)[0]))[0];
            if (new_dims[i] == -1)
            {
                if (compute_axis != -1)
                {
                    PyErr_Format(PyExc_ValueError,
                                 "GpuReshape: only one -1 is accepted "
                                 "in the new shape, but got two at "
                                 "indices %%d and %%zu.",
                                 compute_axis, i);
                    %(fail)s;
                }
                compute_axis = i;
            }
            else
                new_size *= new_dims[i];
        }

        if (compute_axis == -1 && new_size != old_size)
        {
            PyErr_Format(PyExc_ValueError,
                         "GpuReshape: trying to reshape an array of "
                         "total size %%zu into an array of total size "
                         "%%zu.", old_size, new_size);
            %(fail)s;
        }
        else if (compute_axis != -1 && old_size %% new_size != 0)
        {
            PyErr_Format(PyExc_ValueError,
                         "GpuReshape: -1 axis found at index %%d in "
                         "new shape but the total size of the array "
                         "(%%zu) is not divisible by the given shapes "
                         "(%%zu).", compute_axis, old_size, new_size);
            %(fail)s;
        }

        Py_XDECREF(%(output)s);
        %(output)s = pygpu_reshape(%(x)s, %(params)s->ndim, new_dims,
                                   GA_C_ORDER, 0, compute_axis);
        free(new_dims);
        if (%(output)s == NULL)
        {
            %(just_fail)s;
        }
        """ % locals()


class GpuJoin(HideC, Join):
    """
    Join for GPU.

    """
    _f16_ok = True
    __props__ = ("view",)
    params_type = gpu_context_type

    def __init__(self, view=-1):
        # view == -1 means "always copy"; view >= 0 means the output may be
        # a view of input number `view` when all others are empty.
        self.view = view
        if view != -1:
            # since the first input is always the axis, the tensors
            # start from index 1.
            self.view_map = {0: [1 + view]}

    def __str__(self):
        return Join.__str__(self)

    def make_node(self, axis, *tensors):
        node = Join.make_node(self, axis, *tensors)

        ctx_name = infer_context_name(*tensors)

        def agv(v):
            return as_gpuarray_variable(v, context_name=ctx_name)

        return Apply(self, [node.inputs[0]] + list(map(agv, tensors)),
                     [GpuArrayType(broadcastable=node.outputs[0].broadcastable,
                                   dtype=node.outputs[0].dtype,
                                   context_name=ctx_name)()])

    def get_params(self, node):
        return node.outputs[0].type.context

    def perform(self, node, axis_and_tensors, out_, ctx):
        out, = out_
        view = self.view
        axis = int(axis_and_tensors[0])
        tensors = axis_and_tensors[1:]

        if axis < -axis_and_tensors[1].ndim:
            raise IndexError
        if axis < 0:
            axis += axis_and_tensors[1].ndim
        # we check these tensors for being empty.
        # NOTE(review): the loop variable `tensor` shadows the imported
        # `theano.tensor` module inside this comprehension.
        if (view != -1) and np.all(
                [tensor.shape[axis] == 0 for tensor in
                 tensors[0:view] + tensors[view + 1:]]):
            out[0] = tensors[view]
        else:
            out[0] = pygpu.concatenate(tensors, axis=axis, context=ctx).astype(
                node.outputs[0].dtype)

    def c_code_cache_version(self):
        return (3,)

    def c_support_code(self):
        return """
        #if PY_MAJOR_VERSION >= 3
        #define PyInt_AsLong PyLong_AsLong
        #endif
        """

    def c_headers(self):
        return ['<numpy_compat.h>']

    def c_code(self, node, name, inputs, out_, sub):
        axis, tensors = inputs[0], inputs[1:]
        copy_to_list = []
        restype = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
        view = self.view
        non_empty_tensor = tensors[view]
        for i, inp in enumerate(tensors):
            copy_to_list.append("als[%s] = &%s->ga;" % (i, inp))

        n = len(tensors)
        fail = sub['fail']
        out = out_[0]
        copy_inputs_to_list = '\n'.join(copy_to_list)
        ctx = sub['params']

        code = """
        const GpuArray **als = (const GpuArray **)PyMem_Malloc(sizeof(GpuArray *) *
                                                               %(n)s);
        if (als == NULL) {
            PyErr_NoMemory();
            %(fail)s
        }
        %(copy_inputs_to_list)s
        Py_XDECREF(%(out)s);
        {
            int axis = PyInt_AsLong((PyObject *)%(axis)s);
            if (axis < 0) {
                if (axis == -1 && PyErr_Occurred()) {
                    %(fail)s
                }
                axis += als[0]->nd;
                if (axis < 0) {
                    PyErr_SetString(PyExc_IndexError, "invalid axis");
                    %(fail)s
                }
            }

            int tensors_lens_sum;
            if(%(view)s != -1) {
                tensors_lens_sum = 0;
                for(int i=0; i < %(n)s; i++){
                    tensors_lens_sum += als[i]->dimensions[axis];
                }
                tensors_lens_sum -= PyGpuArray_DIM(%(non_empty_tensor)s, axis);
            }

            if(%(view)s != -1 && tensors_lens_sum == 0) {
                Py_INCREF(%(non_empty_tensor)s);
                %(out)s = %(non_empty_tensor)s;
            }else{
                %(out)s = pygpu_concatenate(als, %(n)s, axis,
                                            %(restype)s, (PyObject *)&PyGpuArrayType,
                                            %(ctx)s);
            }
        }
        PyMem_Free(als);
        if (%(out)s == NULL)
            %(fail)s

        """ % locals()
        return code

gpu_join = GpuJoin()


class GpuSplit(HideC, Split):
    """
    Split for GPU.

    """
    _f16_ok = True

    def __init__(self, len_splits):
        super(GpuSplit, self).__init__(len_splits)
        # The GPU version of Split returns splits as views of the input.
        self.view_map = {}
        for i in xrange(self.len_splits):
            self.view_map[i] = [0]

    def make_node(self, x, axis, splits):
        node = Split.make_node(self, x, axis, splits)
        x = as_gpuarray_variable(x, infer_context_name(x))
        outs = [GpuArrayType(dtype=o.dtype, broadcastable=o.broadcastable,
                             context_name=x.type.context_name)()
                for o in node.outputs]
        return Apply(self, [x] + node.inputs[1:], outs)

    # we reuse the perform of the CPU op, which is suitable

    def c_code_cache_version(self):
        return (2,)

    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray_helper.h>']

    def c_header_dirs(self):
        return [pygpu.get_include(), gpuarray_helper_inc_dir()]

    def c_code(self, node, name, inputs, outputs, sub):
        if self.len_splits == 0:
            # There are no outputs, then nothing to do.
            return ''

        # outputs_pointers lists the addresses of the pointers to the outputs.
        outputs_pointers = '&' + (', &'.join(outputs))
        x, axis, splits = inputs
        fail = sub['fail']
        splits_dtype = node.inputs[2].type.dtype_specs()[1]
        axis_dtype = node.inputs[1].type.dtype_specs()[1]
        expected_splits_count = self.len_splits

        main_code = """
        int ndim = PyGpuArray_NDIM(%(x)s);
        int axis = (int)(*(%(axis_dtype)s*)PyArray_GETPTR1(%(axis)s, 0));
        int splits_count = PyArray_DIM(%(splits)s, 0);
        size_t len_along_axis, sum_of_splits = 0;
        %(splits_dtype)s current_split_length;
        size_t* split_points = NULL;
        GpuArray* split_views = NULL;
        GpuArray** split_views_pointers = NULL;
        int i, j;
        PyGpuArrayObject** outputs[] = {%(outputs_pointers)s};

        /* Check inputs. */

        if (splits_count != %(expected_splits_count)s) {
            PyErr_Format(PyExc_ValueError,
                "GpuSplit: splits count (%%d) != expected count (%%d).", splits_count, %(expected_splits_count)s);
            %(fail)s
        }
        if (axis < 0) {
            axis += ndim;
        }
        if (axis < 0 || axis >= ndim) {
            PyErr_Format(PyExc_IndexError, "GpuSplit: invalid axis %%d for a %%d-D array.", axis, ndim);
            %(fail)s
        }
        len_along_axis = PyGpuArray_DIM(%(x)s, axis);
        for (i = 0; i < splits_count; ++i) {
            current_split_length = *(%(splits_dtype)s*)PyArray_GETPTR1(%(splits)s, i);
            if (current_split_length < 0) {
                PyErr_Format(PyExc_ValueError,
                    "GpuSplit: you try to take a negative number (%%ld) of elements.", current_split_length);
                %(fail)s
            }
            sum_of_splits += current_split_length;
        }
        if (sum_of_splits != len_along_axis) {
            PyErr_Format(PyExc_ValueError, "GpuSplit: the splits sums to %%ld, expected %%ld.", sum_of_splits, len_along_axis);
            %(fail)s
        }

        /* Compute splits views. */

        split_points = (size_t*) malloc((splits_count - 1) * sizeof(size_t));
        if (split_points == NULL) {
            PyErr_NoMemory();
            %(fail)s
        }
        split_points[0] = (size_t) (* (%(splits_dtype)s*) PyArray_GETPTR1(%(splits)s, 0) );
        for(i = 1; i < splits_count - 1; ++i) {
            split_points[i] = split_points[i - 1] + (size_t) (* (%(splits_dtype)s*) PyArray_GETPTR1(%(splits)s, i) );
        }
        split_views = (GpuArray*) malloc(splits_count * sizeof(GpuArray));
        split_views_pointers = (GpuArray**) malloc(splits_count * sizeof(GpuArray*));
        if (split_views == NULL || split_views_pointers == NULL) {
            PyErr_NoMemory();
            free(split_views_pointers);
            free(split_views);
            free(split_points);
            %(fail)s
        }
        for (i = 0; i < splits_count; ++i) {
            split_views_pointers[i] = split_views + i;
        }
        if (GpuArray_split(split_views_pointers, &%(x)s->ga, splits_count - 1, split_points, axis) != GA_NO_ERROR) {
            PyErr_SetString(PyExc_RuntimeError, "GpuSplit: unable to compute split.");
            for (i = 0; i < splits_count; ++i) {
                GpuArray_clear(split_views_pointers[i]);
            }
            free(split_views_pointers);
            free(split_views);
            free(split_points);
            %(fail)s
        }

        /* Put split views into outputs. */
        for (i = 0; i < splits_count; ++i) {
            PyGpuArrayObject** output = outputs[i];
            Py_XDECREF(*output);
            *output = pygpu_fromgpudata(
                split_views[i].data,
                split_views[i].offset,
                split_views[i].typecode,
                split_views[i].nd,
                split_views[i].dimensions,
                split_views[i].strides,
                %(x)s->context,
                1, // output is writable
                Py_None, Py_None
            );
            if (*output == NULL) {
                PyErr_SetString(PyExc_RuntimeError, "GpuSplit: unable to update an output from a split view.");
                for (j = 0; j < splits_count; ++j) {
                    GpuArray_clear(split_views_pointers[j]);
                }
                free(split_views_pointers);
                free(split_views);
                free(split_points);
                %(fail)s
            }
        }

        /* Free memory. */
        for (i = 0; i < splits_count; ++i) {
            GpuArray_clear(split_views_pointers[i]);
        }
        free(split_views_pointers);
        free(split_views);
        free(split_points);
        """

        return main_code % locals()


@theano.compile.profiling.register_profiler_printer
def profile_printer(message, compile_time, fct_call_time,
                    apply_time, apply_cimpl, outputs_size, file):
    # Extra profiler section printed only when the graph contains GPU ops.
    if any([x.op.__class__.__name__.lower().startswith("gpu")
            for x in apply_time.keys()]):
        local_time = sum(apply_time.values())
        print('', file=file)
        print('Some info useful for gpu:', file=file)

        fgraphs = set()
        for node in apply_time.keys():
            fgraphs.add(node.fgraph)

        # Split total apply time into cpu / gpu / transfer buckets.
        cpu = 0
        gpu = 0
        trans = 0
        for node, t in iteritems(apply_time):
            if isinstance(node.op, (HostFromGpu, GpuFromHost)):
                trans += t
            elif node.op.__class__.__name__.lower().startswith("gpu"):
                gpu += t
            else:
                cpu += t
        print('', file=file)
        print(" Spent %.3fs(%.2f%%) in cpu Op, %.3fs(%.2f%%) in gpu Op and %.3fs(%.2f%%) transfert Op" % (
            cpu, cpu / local_time * 100, gpu, gpu / local_time * 100,
            trans, trans / local_time * 100), file=file)

        print('', file=file)
        print(" Theano function input that are float64", file=file)
        print(" <fct name> <input name> <input type> <str input>", file=file)
        for fg in fgraphs:
            for i in fg.inputs:
                if hasattr(i.type, 'dtype') and i.type.dtype == 'float64':
                    print('    ', fg.name, i.name, i.type, i, file=file)

        print('', file=file)
        print(" List of apply that don't have float64 as input but have float64 in outputs", file=file)
        print(" (Useful to know if we forgot some cast when using floatX=float32 or gpu code)", file=file)
        print(' <Apply> <Apply position> <fct name> <inputs type> <outputs type>', file=file)
        for fg in fgraphs:
            for idx, node in enumerate(fg.toposort()):
                if (any(hasattr(i, 'dtype') and i.dtype == 'float64'
                        for i in node.outputs) and
                        not any(hasattr(i, 'dtype') and i.dtype == 'float64'
                                for i in node.inputs)):

                    print('    ', str(node), idx, fg.name, end=' ',
                          file=file)
                    print(str([getattr(i, 'dtype', None)
                               for i in node.inputs]), end=' ', file=file)
                    print(str([getattr(i, 'dtype', None)
                               for i in node.outputs]), file=file)
        print('', file=file)


class GpuEye(GpuKernelBase, Op):
    """
    Eye for GPU.

    """
    __props__ = ('dtype', 'context_name')
    _f16_ok = True

    def __init__(self, dtype=None, context_name=None):
        if dtype is None:
            dtype = config.floatX
        self.dtype = dtype
        self.context_name = context_name

    def get_params(self, node):
        return get_context(self.context_name)

    def make_node(self, n, m, k):
        # n: rows, m: columns, k: diagonal offset; all scalars.
        n = tensor.as_tensor_variable(n)
        m = tensor.as_tensor_variable(m)
        k = tensor.as_tensor_variable(k)
        assert n.ndim == 0
        assert m.ndim == 0
        assert k.ndim == 0
        otype = GpuArrayType(dtype=self.dtype,
                             broadcastable=(False, False),
                             context_name=self.context_name)

        return Apply(self, [n, m, k], [otype()])

    def infer_shape(self, node, in_shapes):
        out_shape = [node.inputs[0], node.inputs[1]]
        return [out_shape]

    def grad(self, inp, grads):
        # The output does not depend continuously on any input.
        return [grad_undefined(self, i, inp[i])
                for i in xrange(3)]

    def gpu_kernels(self, node, name):
        # Kernel writes 1 on the k-th diagonal of an already-zeroed array.
        code = """#include "cluda.h"

KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
                ga_size n, ga_size m, ga_ssize k) {
    a = (GLOBAL_MEM %(ctype)s *)(((GLOBAL_MEM char *)a) + a_off);
    ga_ssize coff = max(k, (ga_ssize) 0);
    ga_ssize roff = -min(k, (ga_ssize) 0);
    ga_size nb = (ga_size) min(n - roff, m - coff);
    for (ga_size i = LID_0; i < nb; i += LDIM_0) {
        a[(i + roff)*m + i + coff] = %(write_a)s(1);
    }
}""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype),
            name=name, write_a=write_w(self.dtype))
        return [Kernel(
                code=code, name="eye",
                params=[gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE,
                        gpuarray.SIZE, gpuarray.SSIZE],
                flags=Kernel.get_flags(self.dtype),
                objvar='k_eye_' + name)]

    def c_code(self, node, name, inp, out, sub):
        # NOTE(review): make_node always builds 3 inputs, so the len == 2
        # branch looks unreachable here — kept for safety.
        if len(inp) == 2:
            n, m = inp
            k = 0
        elif len(inp) == 3:
            n, m, k = inp

        z, = out
        fail = sub['fail']
        ctx = sub['params']
        typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype)
        kname = self.gpu_kernels(node, name)[0].objvar
        s = """
        size_t dims[2] = {0, 0};
        size_t ls, gs;
        ssize_t k;
        size_t col_off;
        size_t row_off;
        int err;

        dims[0] = ((dtype_%(n)s*)PyArray_DATA(%(n)s))[0];
        dims[1] = ((dtype_%(m)s*)PyArray_DATA(%(m)s))[0];
        k = ((dtype_%(k)s*)PyArray_DATA(%(k)s))[0];

        Py_CLEAR(%(z)s);

        %(z)s = pygpu_zeros(2, dims,
                            %(typecode)s,
                            GA_C_ORDER,
                            %(ctx)s, Py_None);
        if (%(z)s == NULL) {
            %(fail)s
        }

        ls = 1;
        gs = 256;
        col_off = (size_t) (k > 0?k:0);
        row_off = (size_t) (k < 0?-k:0);
        if (row_off < dims[0] && col_off < dims[1]) {
            err = eye_call(1, &gs, &ls, 0, %(z)s->ga.data, %(z)s->ga.offset,
                           dims[0], dims[1], k);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "gpuarray error: kEye: %%s. n%%lu, m=%%lu.",
                             GpuKernel_error(&%(kname)s, err),
                             (unsigned long)dims[0], (unsigned long)dims[1]);
                %(fail)s;
            }
        }

        """ % locals()

        return s

    def c_code_cache_version(self):
        return (10,)


class GpuTri(GpuKernelBase, Op):
    """
    Tri for GPU.

    """
    __props__ = ('dtype', 'context_name')
    _f16_ok = True

    def __init__(self, dtype=None, context_name=None):
        if dtype is None:
            dtype = config.floatX
        self.dtype = dtype
        self.context_name = context_name

    def get_params(self, node):
        return get_context(self.context_name)

    def make_node(self, n, m, k):
        # n: rows, m: columns, k: diagonal offset; all scalars.
        n = tensor.as_tensor_variable(n)
        m = tensor.as_tensor_variable(m)
        k = tensor.as_tensor_variable(k)
        assert n.ndim == 0
        assert m.ndim == 0
        assert k.ndim == 0
        otype = GpuArrayType(dtype=self.dtype,
                             broadcastable=(False, False),
                             context_name=self.context_name)

        return Apply(self, [n, m, k], [otype()])

    def infer_shape(self, node, in_shapes):
        out_shape = [node.inputs[0], node.inputs[1]]
        return [out_shape]

    def grad(self, inp, grads):
        # The output does not depend continuously on any input.
        return [grad_undefined(self, i, inp[i])
                for i in xrange(3)]

    def gpu_kernels(self, node, name):
        # Kernel writes 1 at and below the k-th diagonal of a zeroed array.
        code = """#include "cluda.h"

KERNEL void tri(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
                ga_size n, ga_size m, ga_ssize k) {
    a = (GLOBAL_MEM %(ctype)s *)(((GLOBAL_MEM char *)a) + a_off);
    ga_ssize coff = max(k, (ga_ssize) 0);
    ga_ssize roff = -min(k, (ga_ssize) 0);
    for (ga_size i = LID_0; i < min(n - roff,n); i += LDIM_0) {
        for (ga_size j = 0; j <= min(i + coff,m-1); j++) {
            a[(i + roff)*m + j] = %(write_a)s(1);
        }
    }
}""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype),
            name=name, write_a=write_w(self.dtype))
        return [Kernel(
                code=code, name="tri",
                params=[gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE,
                        gpuarray.SIZE, gpuarray.SSIZE],
                flags=Kernel.get_flags(self.dtype),
                objvar='k_tri_' + name)]

    def c_code(self, node, name, inp, out, sub):
        # NOTE(review): make_node always builds 3 inputs, so the len == 2
        # branch looks unreachable here — kept for safety.
        if len(inp) == 2:
            n, m = inp
            k = 0
        elif len(inp) == 3:
            n, m, k = inp

        z, = out
        fail = sub['fail']
        ctx = sub['params']
        typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype)
        kname = self.gpu_kernels(node, name)[0].objvar
        s = """
        size_t dims[2] = {0, 0};
        size_t ls, gs;
        ssize_t k;
        int err;

        dims[0] = ((dtype_%(n)s*)PyArray_DATA(%(n)s))[0];
        dims[1] = ((dtype_%(m)s*)PyArray_DATA(%(m)s))[0];
        k = ((dtype_%(k)s*)PyArray_DATA(%(k)s))[0];

        Py_CLEAR(%(z)s);

        %(z)s = pygpu_zeros(2, dims,
                            %(typecode)s,
                            GA_C_ORDER,
                            %(ctx)s, Py_None);
        if (%(z)s == NULL) {
            %(fail)s
        }

        ls = 1;
        gs = 256;
        err = tri_call(1, &gs, &ls, 0, %(z)s->ga.data, %(z)s->ga.offset,
                       dims[0], dims[1], k);
        if (err != GA_NO_ERROR) {
            PyErr_Format(PyExc_RuntimeError,
                         "gpuarray error: kTri: %%s. n%%lu, m=%%lu.",
                         GpuKernel_error(&%(kname)s, err),
                         (unsigned long)dims[0], (unsigned long)dims[1]);
            %(fail)s;
        }
        """ % locals()

        return s

    def c_code_cache_version(self):
        return (1,)