from __future__ import absolute_import, division, print_function

import warnings

import pkg_resources
import numpy as np
from numpy.linalg.linalg import LinAlgError

import theano
from theano import Op, config, tensor
from theano.scalar import bool as bool_t
from theano.gof import COp, ParamsType
from theano.gpuarray import GpuArrayType

from .basic_ops import (CGpuKernelBase, as_gpuarray_variable, gpu_contiguous,
                        gpuarray_helper_inc_dir, infer_context_name)
from .type import gpu_context_type

try:
    import pygpu
    from pygpu.basic import triu, tril
    pygpu_available = True
except ImportError:
    pygpu_available = False

cusolver_available = False
try:
    import skcuda
    from skcuda import cusolver
    cusolver_available = True
except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound):
    pass

cublas_available = False
try:
    from skcuda import cublas
    cublas_available = True
except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound):
    pass

if cusolver_available:
    # Add cusolver calls that are missing in skcuda.
    # SPOTRS
    cusolver._libcusolver.cusolverDnSpotrs.restype = int
    cusolver._libcusolver.cusolverDnSpotrs.argtypes = [cusolver.ctypes.c_void_p,
                                                       cusolver.ctypes.c_int,
                                                       cusolver.ctypes.c_int,
                                                       cusolver.ctypes.c_int,
                                                       cusolver.ctypes.c_void_p,
                                                       cusolver.ctypes.c_int,
                                                       cusolver.ctypes.c_void_p,
                                                       cusolver.ctypes.c_int,
                                                       cusolver.ctypes.c_void_p]

    def cusolverDnSpotrs(handle, uplo, n, nrhs, A, lda,
                         B, ldb, devInfo):
        """
        Solve a real single precision linear system for Hermitian matrices.

        References
        ----------
        `cusolverDn<t>potrs <http://docs.nvidia.com/cuda/cusolver/index.html#cuds-lt-t-gt-potrs>`_

        """
        status = cusolver._libcusolver.cusolverDnSpotrs(handle, uplo, n, nrhs,
                                                        int(A), lda, int(B),
                                                        ldb, int(devInfo))
        cusolver.cusolverCheckStatus(status)

    # DPOTRS
    # TODO: Is this still missing in skcuda?
    cusolver._libcusolver.cusolverDnDpotrs.restype = int
    cusolver._libcusolver.cusolverDnDpotrs.argtypes = [cusolver.ctypes.c_void_p,
                                                       cusolver.ctypes.c_int,
                                                       cusolver.ctypes.c_int,
                                                       cusolver.ctypes.c_int,
                                                       cusolver.ctypes.c_void_p,
                                                       cusolver.ctypes.c_int,
                                                       cusolver.ctypes.c_void_p,
                                                       cusolver.ctypes.c_int,
                                                       cusolver.ctypes.c_void_p]

    def cusolverDnDpotrs(handle, uplo, n, nrhs, A, lda,
                         B, ldb, devInfo):
        """
        Solve a real double precision linear system for Hermitian matrices.
        """
        status = cusolver._libcusolver.cusolverDnDpotrs(handle, uplo, n, nrhs,
                                                        int(A), lda, int(B),
                                                        ldb, int(devInfo))
        cusolver.cusolverCheckStatus(status)


def attach_cusolver_handle_to_context(ctx):
    handle = getattr(ctx, 'cusolver_handle', None)
    if handle is None:
        with ctx:
            ctx.cusolver_handle = cusolver.cusolverDnCreate()


def attach_cublas_handle_to_context(ctx):
    handle = getattr(ctx, 'cublas_handle', None)
    if handle is None:
        with ctx:
            ctx.cublas_handle = cublas.cublasCreate()


# A subset of the cases available in slinalg's MATRIX_STRUCTURES.
MATRIX_STRUCTURES_SOLVE = (
    'general',
    'symmetric',
    'lower_triangular',
    'upper_triangular')
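

# A minimal sketch of how the helpers above are meant to be used: handles are
# created lazily, cached on the pygpu context, and then reused by every Op
# running on that context. The `_example_` name is illustrative only, and a
# working CUDA context with skcuda's cusolver importable is assumed.
def _example_handle_caching(ctx):
    attach_cusolver_handle_to_context(ctx)
    attach_cusolver_handle_to_context(ctx)  # second call is a no-op
    return ctx.cusolver_handle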


class GpuCusolverSolve(Op):
    """
    CUSOLVER GPU solver Op.

    Parameters
    ----------
    A_structure
        Structure of the matrix `A`; one of `MATRIX_STRUCTURES_SOLVE`.
    trans
        Whether to take the transpose of the input matrix or not.
    inplace
        Whether to overwrite the input matrix `A` with its factorization.

    """

    __props__ = ('A_structure', 'trans', 'inplace')

    def __init__(self, A_structure='general', trans='N', inplace=False):
        self.trans = trans
        self.inplace = inplace
        self.A_structure = A_structure
        if self.inplace:
            self.destroy_map = {0: [0]}
        assert A_structure in MATRIX_STRUCTURES_SOLVE
        super(GpuCusolverSolve, self).__init__()

    def make_node(self, inp1, inp2):
        if not cusolver_available:
            raise RuntimeError('CUSOLVER is not available and '
                               'GpuCusolverSolve Op cannot be constructed.')
        if skcuda.__version__ <= '0.5.1':
            warnings.warn('The GpuCusolverSolve op requires '
                          'scikit-cuda > 0.5.1 to work with CUDA 8')
        context_name = infer_context_name(inp1, inp2)

        inp1 = as_gpuarray_variable(inp1, context_name)
        inp2 = as_gpuarray_variable(inp2, context_name)

        inp1 = gpu_contiguous(inp1)
        inp2 = gpu_contiguous(inp2)

        assert inp1.ndim == 2
        assert inp2.ndim == 2
        assert inp1.dtype == inp2.dtype

        return theano.Apply(
            self, [inp1, inp2],
            [GpuArrayType(inp1.dtype,
                          broadcastable=inp1.broadcastable,
                          context_name=context_name)()])

    def prepare_node(self, node, storage_map, compute_map, impl):
        ctx = node.inputs[0].type.context
        attach_cusolver_handle_to_context(ctx)

    def check_dev_info(self, dev_info):
        val = np.asarray(dev_info)[0]
        if val > 0:
            raise LinAlgError('A is singular')

    def perform(self, node, inputs, outputs):
        context = inputs[0][0].context

        # Output storage for the solution.
        z = outputs[0]

        # Matrix.
        A = inputs[0]

        # Solution vectors.
        b = inputs[1]

        assert len(A.shape) == 2
        assert len(b.shape) == 2

        if self.trans in ['T', 'C']:
            trans = 1
            l, n = A.shape
            k, m = b.shape
        elif self.trans == 'N':
            trans = 0
            n, l = A.shape
            k, m = b.shape
        else:
            raise ValueError('Invalid value for trans')
        if l != n:
            raise ValueError('A must be a square matrix')
        if n != k:
            raise ValueError('A and b must be aligned.')

        lda = max(1, n)
        ldb = max(1, k)

        # We copy A and b as cusolver operates inplace.
        b = pygpu.array(b, copy=True, order='F')
        if not self.inplace:
            A = pygpu.array(A, copy=True)
        A_ptr = A.gpudata
        b_ptr = b.gpudata

        # cusolver expects an F ordered matrix, but A is not explicitly
        # converted between C and F order; instead we switch the
        # "transpose" flag.
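        # (Why the flip is valid: reading a C-ordered buffer as F-ordered
        # yields the transpose of the same matrix, so toggling the transpose
        # flag on the C-ordered data still solves the intended system.)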
        if A.flags['C_CONTIGUOUS']:
            trans = 1 - trans

        if A.dtype == 'float32':
            potrf_bufferSize = cusolver.cusolverDnSpotrf_bufferSize
            potrf = cusolver.cusolverDnSpotrf
            potrs = cusolverDnSpotrs
            getrf_bufferSize = cusolver.cusolverDnSgetrf_bufferSize
            getrf = cusolver.cusolverDnSgetrf
            getrs = cusolver.cusolverDnSgetrs
        elif A.dtype == 'float64':
            potrf_bufferSize = cusolver.cusolverDnDpotrf_bufferSize
            potrf = cusolver.cusolverDnDpotrf
            potrs = cusolverDnDpotrs
            getrf_bufferSize = cusolver.cusolverDnDgetrf_bufferSize
            getrf = cusolver.cusolverDnDgetrf
            getrs = cusolver.cusolverDnDgetrs
        else:
            raise ValueError("Unsupported dtype")

        if self.A_structure == 'symmetric':
            with context:
                workspace_size = potrf_bufferSize(
                    context.cusolver_handle, 0, n, A_ptr, lda)

            workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
                                    context=context)

            dev_info = pygpu.zeros((1,), dtype='int32', context=context)

            workspace_ptr = workspace.gpudata
            dev_info_ptr = dev_info.gpudata

            with context:
                potrf(
                    context.cusolver_handle, 0, n, A_ptr, lda, workspace_ptr,
                    workspace_size, dev_info_ptr)
                self.check_dev_info(dev_info)

                potrs(
                    context.cusolver_handle, 0, n, m, A_ptr, lda,
                    b_ptr, ldb, dev_info_ptr)

        else:
            # General case for A.
            with context:
                workspace_size = getrf_bufferSize(
                    context.cusolver_handle, n, n, A_ptr, lda)

            workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
                                    context=context)

            pivots = pygpu.zeros(n, dtype='int32', context=context)

            dev_info = pygpu.zeros((1,), dtype='int32', context=context)

            workspace_ptr = workspace.gpudata
            pivots_ptr = pivots.gpudata
            dev_info_ptr = dev_info.gpudata

            with context:
                getrf(
                    context.cusolver_handle, n, n, A_ptr, lda, workspace_ptr,
                    pivots_ptr, dev_info_ptr)
                self.check_dev_info(dev_info)

                getrs(
                    context.cusolver_handle, trans, n, m, A_ptr, lda,
                    pivots_ptr, b_ptr, ldb, dev_info_ptr)

        z[0] = b

    def L_op(self, inputs, outputs, output_gradients):
        # Modified from theano/tensor/slinalg.py
        A, b = inputs
        c = outputs[0]
        c_bar = output_gradients[0]
        # FIXME: should triangular structures use GpuCublasTriangularSolve?
        # No need to handle A_structure like slinalg.py?
        trans_solve_op = GpuCusolverSolve('general')
        b_bar = trans_solve_op(A.T, c_bar)
        A_bar = -tensor.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)
        return [A_bar, b_bar]
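

# A minimal usage sketch for the Op above, through the gpu_solve() helper
# defined later in this module. It assumes Theano is configured with a GPU
# context; the `_example_` name and shapes are illustrative only.
def _example_cusolver_solve():
    import numpy as np
    import theano
    import theano.tensor as T
    A = T.fmatrix('A')
    b = T.fmatrix('b')
    f = theano.function([A, b], gpu_solve(A, b, A_structure='general'))
    A_val = np.random.rand(4, 4).astype('float32')
    b_val = np.random.rand(4, 1).astype('float32')
    return f(A_val, b_val)  # approximate solution x of A x = b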


class GpuCublasTriangularSolve(Op):
    """
    CUBLAS GPU Triangular Solve Op.

    Parameters
    ----------
    lower
        Whether the system is lower-triangular (True) or
        upper-triangular (False).
    trans
        Whether to take the transpose of the input matrix or not.

    """
    __props__ = ('trans', 'lower')

    def __init__(self, lower=True, trans='N'):
        self.trans = trans
        self.lower = lower
        super(GpuCublasTriangularSolve, self).__init__()

    def make_node(self, inp1, inp2):
        if not cublas_available:
            raise RuntimeError('CUBLAS is not available and '
                               'GpuCublasTriangularSolve Op '
                               'cannot be constructed.')
        context_name = infer_context_name(inp1, inp2)

        inp1 = as_gpuarray_variable(inp1, context_name)
        inp2 = as_gpuarray_variable(inp2, context_name)

        inp1 = gpu_contiguous(inp1)
        inp2 = gpu_contiguous(inp2)

        assert inp1.ndim == 2
        assert inp2.ndim in [1, 2]
        assert inp1.dtype == inp2.dtype

        return theano.Apply(self, [inp1, inp2],
                            [GpuArrayType(inp1.dtype,
                                          broadcastable=inp2.broadcastable,
                                          context_name=context_name)()])

    def prepare_node(self, node, storage_map, compute_map, impl):
        ctx = node.inputs[0].type.context
        attach_cublas_handle_to_context(ctx)

    def perform(self, node, inputs, outputs):
        ctx = node.inputs[0].type.context

        # Output storage for the solution.
        x = outputs[0]

        # Matrix.
        A = inputs[0]

        # Right hand side.
        b = inputs[1]

        assert len(A.shape) == 2
        assert len(b.shape) in [1, 2]

        # Implicitly deal with the difference between C order
        # and Fortran order by flipping the trans and lower flags.
        lower = not self.lower
        trans = self.trans
        if trans in ['T', 'C']:
            trans = 'N'
            l, n = A.shape
        elif trans == 'N':
            trans = 'T'
            n, l = A.shape
        else:
            raise ValueError('Invalid value for trans')

        if b.ndim == 2:
            k, m = b.shape
        else:
            k, = b.shape
            m = 1

        if l != n:
            raise ValueError('A must be a square matrix')
        if n != k:
            raise ValueError('A and b must be aligned.')

        lda = max(1, n)
        ldb = max(1, k)

        # The solution overwrites the right hand side on exit.
        b = pygpu.array(b, copy=True, order='F')

        A_ptr = A.gpudata
        b_ptr = b.gpudata

        # Unit scalar used for multiplication.
        alpha = 1.0
        # Indicates matrix A is on the left of B.
        side = 'l'
        # Set whether the upper or lower part of matrix A is stored.
        uplo = 'l' if lower else 'u'
        # Indicates elements on the diagonal of matrix A may not be unity.
        diag = 'n'

        if A.dtype == 'float32':
            trsv = cublas.cublasStrsv
            trsm = cublas.cublasStrsm
        elif A.dtype == 'float64':
            trsv = cublas.cublasDtrsv
            trsm = cublas.cublasDtrsm
        else:
            raise ValueError("Unsupported dtype")

        with ctx:
            if b.ndim == 1:
                # Matrix-vector solve.
                trsv(ctx.cublas_handle, uplo, trans, diag, n,
                     A_ptr, lda, b_ptr, 1)
            else:
                trsm(ctx.cublas_handle, side, uplo, trans, diag,
                     n, m, alpha, A_ptr, lda, b_ptr, ldb)

        x[0] = b

    def L_op(self, inputs, outputs, output_gradients):
        # Modified from theano/tensor/slinalg.py
        A, b = inputs
        c = outputs[0]
        c_bar = output_gradients[0]

        trans_solve_op = GpuCublasTriangularSolve(not self.lower)
        b_bar = trans_solve_op(A.T, c_bar)

        A_bar = -tensor.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)

        if self.lower:
            A_bar = tensor.tril(A_bar)
        else:
            A_bar = tensor.triu(A_bar)
        return [A_bar, b_bar]
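

# A minimal usage sketch for the triangular solver above, through the
# gpu_solve_lower_triangular() helper defined below. Assumes a configured GPU
# context; the `_example_` name and the test data are illustrative only.
def _example_triangular_solve():
    import numpy as np
    import theano
    import theano.tensor as T
    L = T.fmatrix('L')
    b = T.fvector('b')
    f = theano.function([L, b], gpu_solve_lower_triangular(L, b))
    # A well-conditioned lower-triangular system.
    L_val = np.tril(np.random.rand(3, 3) + 3 * np.eye(3)).astype('float32')
    b_val = np.random.rand(3).astype('float32')
    return f(L_val, b_val)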


def gpu_solve(A, b, A_structure='general', trans='N'):
    if A_structure == 'lower':
        return GpuCublasTriangularSolve(True, trans)(A, b)
    elif A_structure == 'upper':
        return GpuCublasTriangularSolve(False, trans)(A, b)

    return GpuCusolverSolve(A_structure, trans)(A, b)


def gpu_solve_lower_triangular(A, b, trans='N'):
    return GpuCublasTriangularSolve(True, trans)(A, b)


def gpu_solve_upper_triangular(A, b, trans='N'):
    return GpuCublasTriangularSolve(False, trans)(A, b)


class GpuCholesky(Op):
    """
    CUSOLVER GPU Cholesky Op.

    Given a real positive definite matrix `A` returns either a lower
    triangular matrix `L` such that `A == dot(L, L.T)` if `lower == True`
    else returns an upper triangular matrix `U` such that `A == dot(U.T, U)`
    if `lower == False`.

    Parameters
    ----------
    lower
        Whether to return a lower rather than upper triangular decomposition.

    """

    __props__ = ('lower', 'inplace')

    def __init__(self, lower=True, inplace=False):
        self.lower = lower
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}
        super(GpuCholesky, self).__init__()

    def clone_inplace(self):
        return self.__class__(lower=self.lower, inplace=True)

    def make_node(self, inp):
        if not cusolver_available:
            raise RuntimeError('CUSOLVER is not available and '
                               'GpuCholesky Op cannot be constructed.')
        if skcuda.__version__ <= '0.5.1':
            warnings.warn('The GpuCholesky op requires scikit-cuda > '
                          '0.5.1 to work with CUDA 8')
        if not pygpu_available:
            raise RuntimeError('Missing pygpu or triu/tril functions. '
                               'Install or update libgpuarray.')
        context_name = infer_context_name(inp)

        inp = as_gpuarray_variable(inp, context_name)

        inp = gpu_contiguous(inp)

        assert inp.ndim == 2

        return theano.Apply(self, [inp], [inp.type()])

    def prepare_node(self, node, storage_map, compute_map, impl):
        ctx = node.inputs[0].type.context
        attach_cusolver_handle_to_context(ctx)

    def perform(self, node, inputs, outputs):
        context = inputs[0][0].context

        # Input matrix.
        A = inputs[0]

        l, n = A.shape
        if l != n:
            raise ValueError('A must be a square matrix')

        lda = max(1, n)

        # cusolver operates on F ordered matrices, but A is expected
        # to be symmetric so it does not matter.
        # We copy A if needed.
        if self.inplace:
            L = A
        else:
            L = pygpu.array(A, copy=True)

        # The output matrix will contain only the upper or lower
        # triangular factorization of A. If L is C ordered (it
        # probably is, as that is the default in Theano) we just switch
        # the fill mode parameter of cusolver.
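        # (The switch is valid for the same reason as the transpose-flag flip
        # in GpuCusolverSolve.perform: a C-ordered buffer read as F-ordered is
        # the transpose of the same data, and transposing swaps the lower and
        # upper triangles.)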
        l_parameter = 0 if self.lower else 1
        if L.flags['C_CONTIGUOUS']:
            l_parameter = 1 - l_parameter

        L_ptr = L.gpudata

        if A.dtype == 'float32':
            potrf_bufferSize = cusolver.cusolverDnSpotrf_bufferSize
            potrf = cusolver.cusolverDnSpotrf
        elif A.dtype == 'float64':
            potrf_bufferSize = cusolver.cusolverDnDpotrf_bufferSize
            potrf = cusolver.cusolverDnDpotrf
        else:
            raise ValueError("Unsupported dtype")

        with context:
            workspace_size = potrf_bufferSize(
                context.cusolver_handle, l_parameter, n, L_ptr, lda)

            workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
                                    context=context)

            dev_info = pygpu.zeros((1,), dtype='int32', context=context)

            workspace_ptr = workspace.gpudata
            dev_info_ptr = dev_info.gpudata

            potrf(context.cusolver_handle, l_parameter, n, L_ptr,
                  lda, workspace_ptr, workspace_size, dev_info_ptr)

            val_dev_info = np.asarray(dev_info)[0]
            if val_dev_info > 0:
                raise LinAlgError('Cholesky decomposition failed (is A SPD?)')

        # cusolver leaves the elements of the matrix outside the considered
        # upper or lower triangle unchanged, so we need to put zeros outside
        # the triangle.
        if self.lower:
            tril(L)
        else:
            triu(L)

        outputs[0][0] = L

    def L_op(self, inputs, outputs, gradients):
        # Modified from theano/tensor/slinalg.py
        # No handling for on_error = 'nan'.
        dz = gradients[0]
        chol_x = outputs[0]

        # This is for nan mode:
        #
        # ok = ~tensor.any(tensor.isnan(chol_x))
        # chol_x = tensor.switch(ok, chol_x, 1)
        # dz = tensor.switch(ok, dz, 1)

        # Deal with upper triangular by converting to lower triangular.
        if not self.lower:
            chol_x = chol_x.T
            dz = dz.T

        def tril_and_halve_diagonal(mtx):
            """Extracts lower triangle of square matrix and halves diagonal."""
            return tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.)

        def conjugate_solve_triangular(outer, inner):
            """Computes L^{-T} P L^{-1} for lower-triangular L."""
            return gpu_solve_upper_triangular(
                outer.T, gpu_solve_upper_triangular(outer.T, inner.T).T)

        s = conjugate_solve_triangular(
            chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz)))

        if self.lower:
            grad = tensor.tril(s + s.T) - tensor.diag(tensor.diagonal(s))
        else:
            grad = tensor.triu(s + s.T) - tensor.diag(tensor.diagonal(s))

        return [grad]


def gpu_cholesky(A, lower=True):
    return GpuCholesky(lower)(A)
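

# A minimal usage sketch for gpu_cholesky() above. Assumes a configured GPU
# context; the `_example_` name and the test matrix are illustrative only.
def _example_gpu_cholesky():
    import numpy as np
    import theano
    import theano.tensor as T
    A = T.fmatrix('A')
    f = theano.function([A], gpu_cholesky(A, lower=True))
    M = np.random.rand(4, 4).astype('float32')
    spd = M.dot(M.T) + 4 * np.eye(4, dtype='float32')  # make the input SPD
    return f(spd)  # lower-triangular L with L.dot(L.T) close to spd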


# TODO: add support for float64
class GpuMagmaBase(COp):
    """Base class for magma related operations. Adds the necessary headers
    and libraries, and optionally the locations of the headers and library.
    """
    def c_headers(self):
        return ['gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/ext_cuda.h',
                'gpuarray_helper.h', 'magma.h']

    def c_header_dirs(self):
        dirs = [gpuarray_helper_inc_dir(), pygpu.get_include(),
                config.cuda.include_path]
        if config.magma.include_path:
            dirs.append(config.magma.include_path)
        return dirs

    def c_libraries(self):
        return ['magma']

    def c_lib_dirs(self):
        if config.magma.library_path:
            return [config.magma.library_path]
        return []

    def prepare_node(self, node, storage_map, compute_map, impl):
        from skcuda.magma import magma_init
        ctx = node.inputs[0].type.context
        if not getattr(ctx, 'is_magma_initialized', False):
            with ctx:
                magma_init()
                ctx.is_magma_initialized = True


class GpuMagmaSVD(GpuMagmaBase):
    """Computes the SVD of a matrix :math:`A` using magma library.

    .. warning::

        Because of implementation constraints, this Op returns outputs
        in order ``S, U, VT``. Use :func:`theano.gpuarray.linalg.gpu_svd`
        to get them in expected order ``U, S, VT``.

    """
    __props__ = ('full_matrices', 'compute_uv')
    _cop_num_inputs = 1
    _cop_num_outputs = 3
    check_input = False
    params_type = ParamsType(full_matrices=bool_t, context=gpu_context_type)

    def __init__(self, full_matrices=True, compute_uv=True):
        self.full_matrices = full_matrices
        self.compute_uv = compute_uv
        COp.__init__(self, ['c_code/magma_svd.c'], 'APPLY_SPECIFIC(magma_svd)')

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != 'float32':
            raise TypeError("only `float32` is supported for now")
        if self.compute_uv:
            return theano.Apply(self, [A],
                                # return S, U, VT
                                [GpuArrayType(A.dtype, broadcastable=[False],
                                              context_name=ctx_name)(),
                                 A.type(),
                                 A.type()])
        else:
            return theano.Apply(self, [A],
                                # return only S
                                [GpuArrayType(A.dtype, broadcastable=[False],
                                              context_name=ctx_name)()])

    def prepare_node(self, node, storage_map, compute_map, impl):
        super(GpuMagmaSVD, self).prepare_node(node, storage_map, compute_map, impl)
        # Check node to prevent eventual errors with old pickled nodes.
        if self.compute_uv:
            A, B, C = node.outputs
            # We expect order: S (vector), U (matrix), VT (matrix).
            assert A.type.ndim == 1 and B.type.ndim == C.type.ndim == 2, \
                "Due to implementation constraints, GpuMagmaSVD interface has changed and now returns (S, U, VT) " \
                "instead of (U, S, VT). Either update your code, or use gpu_svd() to get the expected (U, S, VT) order."

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def infer_shape(self, node, shapes):
        x_shape, = shapes
        M, N = x_shape
        K = tensor.minimum(M, N)
        s_shape = (K, )
        if self.compute_uv:
            u_shape = (M, M) if self.full_matrices else (M, K)
            vt_shape = (N, N) if self.full_matrices else (K, N)
            return [s_shape, u_shape, vt_shape]
        else:
            return [s_shape]
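

# A minimal usage sketch: the Op above returns (S, U, VT), and the gpu_svd()
# wrapper defined below reorders the outputs to the conventional (U, S, VT).
# Assumes a configured GPU context with magma; the `_example_` name and shapes
# are illustrative only.
def _example_gpu_svd():
    import numpy as np
    import theano
    import theano.tensor as T
    A = T.fmatrix('A')
    f = theano.function([A], gpu_svd(A, full_matrices=1, compute_uv=1))
    U, S, VT = f(np.random.rand(5, 3).astype('float32'))
    return U, S, VT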
def gpu_svd(a, full_matrices=1, compute_uv=1):
    """
    This function performs the SVD on GPU.

    Parameters
    ----------
    full_matrices : bool, optional
        If True (default), u and vt have the shapes (M, M) and (N, N),
        respectively.
        Otherwise, the shapes are (M, K) and (K, N), respectively,
        where K = min(M, N).
    compute_uv : bool, optional
        Whether or not to compute u and vt in addition to s.
        True by default.

    Returns
    -------
    U, S, VT : matrices
        Only S is returned if `compute_uv` is False.

    """
    out = GpuMagmaSVD(full_matrices, compute_uv)(a)
    if compute_uv:
        S, U, VT = out
        out = [U, S, VT]
    return out


class GpuMagmaMatrixInverse(GpuMagmaBase):
    """Computes the inverse of a matrix :math:`A` using magma library.
    """
    __props__ = ('inplace', )
    check_input = False
    params_type = ParamsType(inplace=bool_t, context=gpu_context_type)

    def __init__(self, inplace=False):
        COp.__init__(self, ['c_code/magma_inv.c'], 'APPLY_SPECIFIC(magma_inv)')
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}

    def clone_inplace(self):
        return self.__class__(inplace=True)

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != 'float32':
            raise TypeError("only `float32` is supported for now")
        return theano.Apply(self, [A], [A.type()])

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def infer_shape(self, node, shapes):
        return shapes


def gpu_matrix_inverse(a):
    """
    This function performs the matrix inverse on GPU.

    Returns
    -------
    a_inv : matrix

    """
    return GpuMagmaMatrixInverse()(a)


class GpuMagmaCholesky(GpuMagmaBase, CGpuKernelBase):
    """Computes the Cholesky decomposition of a matrix :math:`A` using magma
    library.

    """
    __props__ = ('lower', 'inplace')
    check_input = False
    params_type = ParamsType(lower=bool_t, inplace=bool_t, context=gpu_context_type)

    def __init__(self, lower=True, inplace=False):
        self.lower = lower
        COp.__init__(self, ['c_code/magma_cholesky.c'], 'APPLY_SPECIFIC(magma_cholesky)')
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}

    def clone_inplace(self):
        return self.__class__(lower=self.lower, inplace=True)

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != 'float32':
            raise TypeError("only `float32` is supported for now")
        return theano.Apply(self, [A], [A.type()])

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def infer_shape(self, node, shapes):
        return [shapes[0]]
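

# A minimal usage sketch for the magma-based Cholesky Op above, which is
# applied directly since this module defines no helper for it. Assumes a
# configured GPU context with magma; the `_example_` name and the test matrix
# are illustrative only.
def _example_magma_cholesky():
    import numpy as np
    import theano
    import theano.tensor as T
    A = T.fmatrix('A')
    f = theano.function([A], GpuMagmaCholesky(lower=True)(A))
    M = np.random.rand(4, 4).astype('float32')
    return f(M.dot(M.T) + 4 * np.eye(4, dtype='float32'))  # SPD input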


class GpuMagmaQR(GpuMagmaBase, CGpuKernelBase):
    """Computes the QR decomposition of a matrix :math:`A` using magma
    library.

    Parameters
    ----------
    complete : If False, returns only ``R``.

    .. warning::

        Because of implementation constraints, this Op returns outputs
        in order ``R, Q``. Use :func:`theano.gpuarray.linalg.gpu_qr`
        to get them in expected order ``Q, R``.

    """
    __props__ = ('complete', )
    _cop_num_inputs = 1
    _cop_num_outputs = 2
    check_input = False
    params_type = ParamsType(complete=bool_t, context=gpu_context_type)

    def __init__(self, complete=True):
        self.complete = complete
        COp.__init__(self, ['c_code/magma_qr.c'], 'APPLY_SPECIFIC(magma_qr)')

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != 'float32':
            raise TypeError("only `float32` is supported for now")
        if self.complete:
            return theano.Apply(self, [A],
                                # return R, Q
                                [A.type(), A.type()])
        else:
            return theano.Apply(self, [A],
                                # return R
                                [A.type()])

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)


def gpu_qr(a, complete=True):
    """
    This function performs the QR decomposition on GPU.

    Parameters
    ----------
    complete : bool, optional
        If `False`, returns only ``R``.

    Returns
    -------
    Q, R : matrices

    """
    out = GpuMagmaQR(complete)(a)
    if complete:
        R, Q = out
        out = [Q, R]
    return out


class GpuMagmaEigh(GpuMagmaBase):
    """Computes the eigendecomposition of a symmetric matrix :math:`A` using
    magma library.

    Parameters
    ----------
    UPLO : Specifies whether the calculation is done with the lower triangular
        part of the matrix (`L`, default) or the upper triangular part (`U`).
    compute_v : If `True` (default), computes eigenvalues and eigenvectors.
        If `False`, computes only the eigenvalues of the matrix.
    """
    __props__ = ('lower', 'compute_v')
    _cop_num_inputs = 1
    _cop_num_outputs = 2
    check_input = False
    params_type = ParamsType(lower=bool_t, compute_v=bool_t,
                             context=gpu_context_type)

    def __init__(self, UPLO='L', compute_v=True):
        assert UPLO in ['L', 'U']
        self.lower = UPLO == 'L'
        self.compute_v = compute_v
        COp.__init__(self, ['c_code/magma_eigh.c'], 'APPLY_SPECIFIC(magma_eigh)')

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != 'float32':
            raise TypeError("only `float32` is supported for now")
        if self.compute_v:
            return theano.Apply(self, [A],
                                # return D, V
                                [GpuArrayType(A.dtype, broadcastable=[False],
                                              context_name=ctx_name)(),
                                 A.type()])
        else:
            return theano.Apply(self, [A],
                                # return D
                                [GpuArrayType(A.dtype, broadcastable=[False],
                                              context_name=ctx_name)()])

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)
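

# A minimal usage sketch for the eigendecomposition Op above. Assumes a
# configured GPU context with magma; the `_example_` name and the test matrix
# are illustrative only.
def _example_magma_eigh():
    import numpy as np
    import theano
    import theano.tensor as T
    A = T.fmatrix('A')
    d, V = GpuMagmaEigh(UPLO='L', compute_v=True)(A)
    f = theano.function([A], [d, V])
    M = np.random.rand(4, 4).astype('float32')
    return f(M + M.T)  # eigenvalues and eigenvectors of a symmetric matrix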