from __future__ import absolute_import, division, print_function

import warnings

import pkg_resources
import numpy as np
from numpy.linalg.linalg import LinAlgError

import theano
from theano import Op, config, tensor
from theano.scalar import bool as bool_t
from theano.gof import COp, ParamsType
from theano.gpuarray import GpuArrayType

from .basic_ops import (CGpuKernelBase, as_gpuarray_variable, gpu_contiguous, gpuarray_helper_inc_dir,
                        infer_context_name)
from .type import gpu_context_type

try:
    import pygpu
    from pygpu.basic import triu, tril
    pygpu_available = True
except ImportError:
    pygpu_available = False

cusolver_available = False
try:
    import skcuda
    from skcuda import cusolver
    cusolver_available = True
except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound):
    pass

cublas_available = False
try:
    from skcuda import cublas
    cublas_available = True
except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound):
    pass

if cusolver_available:
    # Add cusolver call as it is missing in skcuda
    # SPOTRS
    cusolver._libcusolver.cusolverDnSpotrs.restype = int
    cusolver._libcusolver.cusolverDnSpotrs.argtypes = [cusolver.ctypes.c_void_p,
                                                       cusolver.ctypes.c_int,
                                                       cusolver.ctypes.c_int,
                                                       cusolver.ctypes.c_int,
                                                       cusolver.ctypes.c_void_p,
                                                       cusolver.ctypes.c_int,
                                                       cusolver.ctypes.c_void_p,
                                                       cusolver.ctypes.c_int,
                                                       cusolver.ctypes.c_void_p]

    def cusolverDnSpotrs(handle, uplo, n, nrhs, A, lda,
                         B, ldb, devInfo):
        """
        Solve a real single-precision linear system with a symmetric
        (Hermitian) positive-definite matrix, using the Cholesky factor
        previously computed by potrf.

        References
        ----------
        `cusolverDn<t>potrs <http://docs.nvidia.com/cuda/cusolver/index.html#cuds-lt-t-gt-potrs>`_
        """

        status = cusolver._libcusolver.cusolverDnSpotrs(handle, uplo, n, nrhs,
                                                        int(A), lda, int(B),
                                                        ldb, int(devInfo))
        cusolver.cusolverCheckStatus(status)

    # DPOTRS
    # TODO: Are they still missing in skcuda?
    cusolver._libcusolver.cusolverDnDpotrs.restype = int
    cusolver._libcusolver.cusolverDnDpotrs.argtypes = [cusolver.ctypes.c_void_p,
                                                       cusolver.ctypes.c_int,
                                                       cusolver.ctypes.c_int,
                                                       cusolver.ctypes.c_int,
                                                       cusolver.ctypes.c_void_p,
                                                       cusolver.ctypes.c_int,
                                                       cusolver.ctypes.c_void_p,
                                                       cusolver.ctypes.c_int,
                                                       cusolver.ctypes.c_void_p]

    def cusolverDnDpotrs(handle, uplo, n, nrhs, A, lda,
                         B, ldb, devInfo):
        """
        Double-precision variant of `cusolverDnSpotrs`.
        """
        status = cusolver._libcusolver.cusolverDnDpotrs(handle, uplo, n, nrhs,
                                                        int(A), lda, int(B),
                                                        ldb, int(devInfo))
        cusolver.cusolverCheckStatus(status)


def attach_cusolver_handle_to_context(ctx):
    handle = getattr(ctx, 'cusolver_handle', None)
    if handle is None:
        with ctx:
            ctx.cusolver_handle = cusolver.cusolverDnCreate()


def attach_cublas_handle_to_context(ctx):
    handle = getattr(ctx, 'cublas_handle', None)
    if handle is None:
        with ctx:
            ctx.cublas_handle = cublas.cublasCreate()


# This is a subset of the matrix structures available in slinalg's
# MATRIX_STRUCTURES.
MATRIX_STRUCTURES_SOLVE = (
    'general',
    'symmetric',
    'lower_triangular',
    'upper_triangular')


class GpuCusolverSolve(Op):
    """
    CUSOLVER GPU solver Op.

    Parameters
    ----------
    A_structure
        Structure of the matrix `A`; one of ``MATRIX_STRUCTURES_SOLVE``.
    trans
        Whether to take the transpose of the input matrix or not.
    inplace
        Whether to overwrite the input matrix `A` with its factorization
        instead of copying it.

    """

    __props__ = ('A_structure', 'trans', 'inplace')

    def __init__(self, A_structure='general', trans='N', inplace=False):
        self.trans = trans
        self.inplace = inplace
        self.A_structure = A_structure
        if self.inplace:
            self.destroy_map = {0: [0]}
        assert A_structure in MATRIX_STRUCTURES_SOLVE
        super(GpuCusolverSolve, self).__init__()

    def make_node(self, inp1, inp2):
        if not cusolver_available:
            raise RuntimeError('CUSOLVER is not available and '
                               'GpuCusolverSolve Op cannot be constructed.')
        if (pkg_resources.parse_version(skcuda.__version__) <=
                pkg_resources.parse_version('0.5.1')):
            warnings.warn('The GpuSolve op requires scikit-cuda > 0.5.1 to work with CUDA 8')
        context_name = infer_context_name(inp1, inp2)

        inp1 = as_gpuarray_variable(inp1, context_name)
        inp2 = as_gpuarray_variable(inp2, context_name)

        inp1 = gpu_contiguous(inp1)
        inp2 = gpu_contiguous(inp2)

        assert inp1.ndim == 2
        assert inp2.ndim == 2
        assert inp1.dtype == inp2.dtype

        return theano.Apply(
            self, [inp1, inp2],
            [GpuArrayType(inp1.dtype,
                          broadcastable=inp1.broadcastable,
                          context_name=context_name)()])

    def prepare_node(self, node, storage_map, compute_map, impl):
        ctx = node.inputs[0].type.context
        attach_cusolver_handle_to_context(ctx)

    def check_dev_info(self, dev_info):
        val = np.asarray(dev_info)[0]
        if val > 0:
            raise LinAlgError('A is singular')

    def perform(self, node, inputs, outputs):
        context = inputs[0][0].context

        # Output storage for the solution.
        z = outputs[0]

        # Matrix.
        A = inputs[0]

        # Right-hand side vectors.
        b = inputs[1]

        assert(len(A.shape) == 2)
        assert(len(b.shape) == 2)

        if self.trans in ['T', 'C']:
            trans = 1
            l, n = A.shape
            k, m = b.shape
        elif self.trans == 'N':
            trans = 0
            n, l = A.shape
            k, m = b.shape
        else:
            raise ValueError('Invalid value for trans')
        if l != n:
            raise ValueError('A must be a square matrix')
        if n != k:
            raise ValueError('A and b must be aligned.')

        lda = max(1, n)
        ldb = max(1, k)

        # We copy A and b because cusolver operates in place.
        b = pygpu.array(b, copy=True, order='F')
        if not self.inplace:
            A = pygpu.array(A, copy=True)
        A_ptr = A.gpudata
        b_ptr = b.gpudata

        # cusolver expects an F-ordered matrix, but A is not explicitly
        # converted between C and F order; instead we switch the
        # "transpose" flag.
        if A.flags['C_CONTIGUOUS']:
            trans = 1 - trans

        if A.dtype == 'float32':
            potrf_bufferSize = cusolver.cusolverDnSpotrf_bufferSize
            potrf = cusolver.cusolverDnSpotrf
            potrs = cusolverDnSpotrs
            getrf_bufferSize = cusolver.cusolverDnSgetrf_bufferSize
            getrf = cusolver.cusolverDnSgetrf
            getrs = cusolver.cusolverDnSgetrs
        elif A.dtype == 'float64':
            potrf_bufferSize = cusolver.cusolverDnDpotrf_bufferSize
            potrf = cusolver.cusolverDnDpotrf
            potrs = cusolverDnDpotrs
            getrf_bufferSize = cusolver.cusolverDnDgetrf_bufferSize
            getrf = cusolver.cusolverDnDgetrf
            getrs = cusolver.cusolverDnDgetrs
        else:
            raise ValueError("Unsupported dtype")

        if self.A_structure == 'symmetric':
            with context:
                workspace_size = potrf_bufferSize(
                    context.cusolver_handle, 0, n, A_ptr, lda)

            workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
                                    context=context)

            dev_info = pygpu.zeros((1,), dtype='int32', context=context)

            workspace_ptr = workspace.gpudata
            dev_info_ptr = dev_info.gpudata

            with context:
                potrf(
                    context.cusolver_handle, 0, n, A_ptr, lda, workspace_ptr,
                    workspace_size, dev_info_ptr)
                self.check_dev_info(dev_info)

                potrs(
                    context.cusolver_handle, 0, n, m, A_ptr, lda,
                    b_ptr, ldb, dev_info_ptr)

        else:
            # general case for A
            with context:
                workspace_size = getrf_bufferSize(
                    context.cusolver_handle, n, n, A_ptr, lda)

            workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
                                    context=context)

            pivots = pygpu.zeros(n, dtype='int32', context=context)

            dev_info = pygpu.zeros((1,), dtype='int32', context=context)

            workspace_ptr = workspace.gpudata
            pivots_ptr = pivots.gpudata
            dev_info_ptr = dev_info.gpudata

            with context:
                getrf(
                    context.cusolver_handle, n, n, A_ptr, lda, workspace_ptr,
                    pivots_ptr, dev_info_ptr)
                self.check_dev_info(dev_info)

                getrs(
                    context.cusolver_handle, trans, n, m, A_ptr, lda,
                    pivots_ptr, b_ptr, ldb, dev_info_ptr)

        z[0] = b

    def L_op(self, inputs, outputs, output_gradients):
        # Modified from theano/tensor/slinalg.py
        A, b = inputs
        c = outputs[0]
        c_bar = output_gradients[0]
        # FIXME: triangular structure would use GpuCublasTriangularsolve?
        # no need to handle A_structure like slinalg.py?
        trans_solve_op = GpuCusolverSolve('general')
        b_bar = trans_solve_op(A.T, c_bar)
        A_bar = -tensor.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)
        return [A_bar, b_bar]


class GpuCublasTriangularSolve(Op):
    """
    CUBLAS GPU triangular solve Op.

    Parameters
    ----------
    lower
        Whether the system is lower triangular (True) or upper triangular
        (False).
    trans
        Whether to take the transpose of the input matrix or not.
    """
    __props__ = ('trans', 'lower')

    def __init__(self, lower=True, trans='N'):
        self.trans = trans
        self.lower = lower
        super(GpuCublasTriangularSolve, self).__init__()

    def make_node(self, inp1, inp2):
        if not cublas_available:
            raise RuntimeError('CUBLAS is not available and '
                               'GpuCublasTriangularSolve Op '
                               'cannot be constructed.')
        context_name = infer_context_name(inp1, inp2)

        inp1 = as_gpuarray_variable(inp1, context_name)
        inp2 = as_gpuarray_variable(inp2, context_name)

        inp1 = gpu_contiguous(inp1)
        inp2 = gpu_contiguous(inp2)

        assert inp1.ndim == 2
        assert inp2.ndim in [1, 2]
        assert inp1.dtype == inp2.dtype

        return theano.Apply(self, [inp1, inp2],
                            [GpuArrayType(inp1.dtype,
                                          broadcastable=inp2.broadcastable,
                                          context_name=context_name)()])

    def prepare_node(self, node, storage_map, compute_map, impl):
        ctx = node.inputs[0].type.context
        attach_cublas_handle_to_context(ctx)

    def perform(self, node, inputs, outputs):
        ctx = node.inputs[0].type.context

        # Solution set
        x = outputs[0]

        # Matrix.
        A = inputs[0]

        # right hand side
        b = inputs[1]

        assert(len(A.shape) == 2)
        assert(len(b.shape) in [1, 2])

        # implicitly deal with the difference between C order
        # and fortran order by flipping the trans and lower flags
        lower = not self.lower
        trans = self.trans
        if trans in ['T', 'C']:
            trans = 'N'
            l, n = A.shape
        elif trans == 'N':
            trans = 'T'
            n, l = A.shape
        else:
            raise ValueError('Invalid value for trans')

        if b.ndim == 2:
            k, m = b.shape
        else:
            k, = b.shape
            m = 1

        if l != n:
            raise ValueError('A must be a square matrix')
        if n != k:
            raise ValueError('A and b must be aligned.')

        lda = max(1, n)
        ldb = max(1, k)

        # solution overwrites right hand side on exit
        b = pygpu.array(b, copy=True, order='F')

        A_ptr = A.gpudata
        b_ptr = b.gpudata

        # unit scalar used for multiplication
        alpha = 1.0
        # indicates matrix A is on left of B
        side = 'l'
        # set whether upper or lower part of matrix A stored
        uplo = 'l' if lower else 'u'
        # indicates elements on diagonal of matrix A may not be unity
        diag = 'n'

        if A.dtype == 'float32':
            trsv = cublas.cublasStrsv
            trsm = cublas.cublasStrsm
        elif A.dtype == 'float64':
            trsv = cublas.cublasDtrsv
            trsm = cublas.cublasDtrsm
        else:
            raise ValueError("Unsupported dtype")

        with ctx:
            if b.ndim == 1:
                # matrix vector solve
                trsv(ctx.cublas_handle, uplo, trans, diag, n,
                     A_ptr, lda, b_ptr, 1)
            else:
                trsm(ctx.cublas_handle, side, uplo, trans, diag,
                     n, m, alpha, A_ptr, lda, b_ptr, ldb)

        x[0] = b

    def L_op(self, inputs, outputs, output_gradients):
        # Modified from theano/tensor/slinalg.py
        A, b = inputs
        c = outputs[0]
        c_bar = output_gradients[0]

        trans_solve_op = GpuCublasTriangularSolve(not self.lower)
        b_bar = trans_solve_op(A.T, c_bar)

        A_bar = -tensor.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)

        if self.lower:
            A_bar = tensor.tril(A_bar)
        else:
            A_bar = tensor.triu(A_bar)
        return [A_bar, b_bar]


def gpu_solve(A, b, A_structure='general', trans='N'):
    if A_structure == 'lower':
        return GpuCublasTriangularSolve(True, trans)(A, b)
    elif A_structure == 'upper':
        return GpuCublasTriangularSolve(False, trans)(A, b)

    return GpuCusolverSolve(A_structure, trans)(A, b)


def gpu_solve_lower_triangular(A, b, trans='N'):
    return GpuCublasTriangularSolve(True, trans)(A, b)


def gpu_solve_upper_triangular(A, b, trans='N'):
    return GpuCublasTriangularSolve(False, trans)(A, b)

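# A minimal usage sketch for the solve helpers above, kept as a comment so
# nothing runs at import time. It assumes a configured GPU context with
# cusolver/cublas available; `A_val` and `b_val` are hypothetical numpy arrays.
#
#     import numpy as np
#     import theano
#     from theano import tensor
#
#     A = tensor.matrix('A', dtype='float32')
#     b = tensor.matrix('b', dtype='float32')
#     x = gpu_solve(A, b)                        # LU-based general solve
#     x_low = gpu_solve_lower_triangular(A, b)   # cublas triangular solve
#     f = theano.function([A, b], [x, x_low])
#     x_val, _ = f(A_val, b_val)
#     # transfer back to host before checking the residual
#     assert np.allclose(A_val.dot(np.asarray(x_val)), b_val, atol=1e-4)
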

class GpuCholesky(Op):
    """
    CUSOLVER GPU Cholesky Op.

    Given a real positive-definite matrix `A`, returns either a lower
    triangular matrix `L` such that `A == dot(L, L.T)` if `lower == True`,
    or an upper triangular matrix `U` such that `A == dot(U.T, U)` if
    `lower == False`.

    Parameters
    ----------
    lower
        Whether to return a lower rather than upper triangular decomposition.
    inplace
        Whether to overwrite the input matrix with the decomposition instead
        of copying it.

    """

    __props__ = ('lower', 'inplace')

    def __init__(self, lower=True, inplace=False):
        self.lower = lower
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}
        super(GpuCholesky, self).__init__()

    def clone_inplace(self):
        return self.__class__(lower=self.lower, inplace=True)

    def make_node(self, inp):
        if not cusolver_available:
            raise RuntimeError('CUSOLVER is not available and '
                               'GpuCholesky Op cannot be constructed.')
        if (pkg_resources.parse_version(skcuda.__version__) <=
                pkg_resources.parse_version('0.5.1')):
            warnings.warn('The GpuCholesky op requires scikit-cuda > '
                          '0.5.1 to work with CUDA 8')
        if not pygpu_available:
            raise RuntimeError('Missing pygpu or triu/tril functions. '
                               'Install or update libgpuarray.')
        context_name = infer_context_name(inp)

        inp = as_gpuarray_variable(inp, context_name)

        inp = gpu_contiguous(inp)

        assert inp.ndim == 2

        return theano.Apply(self, [inp], [inp.type()])


    def prepare_node(self, node, storage_map, compute_map, impl):
        ctx = node.inputs[0].type.context
        attach_cusolver_handle_to_context(ctx)

    def perform(self, node, inputs, outputs):
        context = inputs[0][0].context

        # Input matrix.
        A = inputs[0]

        l, n = A.shape
        if l != n:
            raise ValueError('A must be a square matrix')

        lda = max(1, n)

        # cusolver operates on F-ordered matrices, but A is expected to be
        # symmetric, so the order does not matter.
        # We copy A if needed.
        if self.inplace:
            L = A
        else:
            L = pygpu.array(A, copy=True)

        # The output matrix will contain only the upper or lower
        # triangular factorization of A. If L is C ordered (it probably
        # is, as that is the default in Theano), we just switch the
        # fill mode parameter of cusolver.
        l_parameter = 0 if self.lower else 1
        if L.flags['C_CONTIGUOUS']:
            l_parameter = 1 - l_parameter

        L_ptr = L.gpudata

        if A.dtype == 'float32':
            potrf_bufferSize = cusolver.cusolverDnSpotrf_bufferSize
            potrf = cusolver.cusolverDnSpotrf
        elif A.dtype == 'float64':
            potrf_bufferSize = cusolver.cusolverDnDpotrf_bufferSize
            potrf = cusolver.cusolverDnDpotrf
        else:
            raise ValueError("Unsupported dtype")

        with context:
            workspace_size = potrf_bufferSize(
                context.cusolver_handle, l_parameter, n, L_ptr, lda)

            workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
                                    context=context)

            dev_info = pygpu.zeros((1,), dtype='int32', context=context)

            workspace_ptr = workspace.gpudata
            dev_info_ptr = dev_info.gpudata

            potrf(context.cusolver_handle, l_parameter, n, L_ptr,
                  lda, workspace_ptr, workspace_size, dev_info_ptr)

            val_dev_info = np.asarray(dev_info)[0]
            if val_dev_info > 0:
                raise LinAlgError('Cholesky decomposition failed (is A SPD?)')

        # cusolver leaves the elements in the matrix outside the considered
        # upper or lower triangle unchanged, so we need to put zeros outside
        # the triangle
        if self.lower:
            tril(L)
        else:
            triu(L)

        outputs[0][0] = L

    def L_op(self, inputs, outputs, gradients):
        # Modified from theano/tensor/slinalg.py
        # No handling for on_error = 'nan'
        dz = gradients[0]
        chol_x = outputs[0]

        # this is for nan mode
        #
        # ok = ~tensor.any(tensor.isnan(chol_x))
        # chol_x = tensor.switch(ok, chol_x, 1)
        # dz = tensor.switch(ok, dz, 1)

        # deal with upper triangular by converting to lower triangular
        if not self.lower:
            chol_x = chol_x.T
            dz = dz.T

        def tril_and_halve_diagonal(mtx):
            """Extracts lower triangle of square matrix and halves diagonal."""
            return tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.)

        def conjugate_solve_triangular(outer, inner):
            """Computes L^{-T} P L^{-1} for lower-triangular L."""
            return gpu_solve_upper_triangular(
                outer.T, gpu_solve_upper_triangular(outer.T, inner.T).T)

        s = conjugate_solve_triangular(
            chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz)))

        if self.lower:
            grad = tensor.tril(s + s.T) - tensor.diag(tensor.diagonal(s))
        else:
            grad = tensor.triu(s + s.T) - tensor.diag(tensor.diagonal(s))

        return [grad]


def gpu_cholesky(A, lower=True):
    return GpuCholesky(lower)(A)

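# A minimal usage sketch for `gpu_cholesky`, kept as a comment so nothing runs
# at import time. It assumes a configured GPU context with cusolver available;
# `A_val` is a hypothetical symmetric positive-definite float32 numpy array.
#
#     import numpy as np
#     import theano
#     from theano import tensor
#
#     A = tensor.matrix('A', dtype='float32')
#     L = gpu_cholesky(A, lower=True)
#     f = theano.function([A], L)
#     L_val = np.asarray(f(A_val))
#     # the lower-triangular factor should reconstruct A: A ~= L L^T
#     assert np.allclose(L_val.dot(L_val.T), A_val, atol=1e-4)
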

# TODO: add support for float64
class GpuMagmaBase(COp):
    """Base class for MAGMA-related operations. Adds the necessary headers and
    libraries, and optionally the locations of the headers and library.
    """
    def c_headers(self):
        return ['gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/ext_cuda.h',
                'gpuarray_helper.h', 'magma.h']

    def c_header_dirs(self):
        dirs = [gpuarray_helper_inc_dir(), pygpu.get_include(),
                config.cuda.include_path]
        if config.magma.include_path:
            dirs.append(config.magma.include_path)
        return dirs

    def c_libraries(self):
        return ['magma']

    def c_lib_dirs(self):
        if config.magma.library_path:
            return [config.magma.library_path]
        return []

    def prepare_node(self, node, storage_map, compute_map, impl):
        from skcuda.magma import magma_init
        ctx = node.inputs[0].type.context
        if not getattr(ctx, 'is_magma_initialized', False):
            with ctx:
                magma_init()
                ctx.is_magma_initialized = True


class GpuMagmaSVD(GpuMagmaBase):
    """Computes the SVD of a matrix :math:`A` using the MAGMA library.

    .. warning::

        Because of implementation constraints, this Op returns outputs
        in order ``S, U, VT``. Use :func:`theano.gpuarray.linalg.gpu_svd`
        to get them in the expected order ``U, S, VT``.

    """
    __props__ = ('full_matrices', 'compute_uv')
    _cop_num_inputs = 1
    _cop_num_outputs = 3
    check_input = False
    params_type = ParamsType(full_matrices=bool_t, context=gpu_context_type)

    def __init__(self, full_matrices=True, compute_uv=True):
        self.full_matrices = full_matrices
        self.compute_uv = compute_uv
        COp.__init__(self, ['c_code/magma_svd.c'], 'APPLY_SPECIFIC(magma_svd)')

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != 'float32':
            raise TypeError("only `float32` is supported for now")
        if self.compute_uv:
            return theano.Apply(self, [A],
                                # return S, U, VT
                                [GpuArrayType(A.dtype, broadcastable=[False],
                                              context_name=ctx_name)(),
                                 A.type(),
                                 A.type()])
        else:
            return theano.Apply(self, [A],
                                # return only S
                                [GpuArrayType(A.dtype, broadcastable=[False],
                                              context_name=ctx_name)()])

    def prepare_node(self, node, storage_map, compute_map, impl):
        super(GpuMagmaSVD, self).prepare_node(node, storage_map, compute_map, impl)
        # Check node to prevent errors with old pickled nodes.
        if self.compute_uv:
            A, B, C = node.outputs
            # We expect order: S (vector), U (matrix), VT (matrix)
            assert A.type.ndim == 1 and B.type.ndim == C.type.ndim == 2, \
                "Due to implementation constraints, GpuMagmaSVD interface has changed and now returns (S, U, VT) " \
                "instead of (U, S, VT). Either update your code, or use gpu_svd() to get the expected (U, S, VT) order."

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def infer_shape(self, node, shapes):
        x_shape, = shapes
        M, N = x_shape
        K = tensor.minimum(M, N)
        s_shape = (K, )
        if self.compute_uv:
            u_shape = (M, M) if self.full_matrices else (M, K)
            vt_shape = (N, N) if self.full_matrices else (K, N)
            return [s_shape, u_shape, vt_shape]
        else:
            return [s_shape]


def gpu_svd(a, full_matrices=1, compute_uv=1):
    """
    This function performs the SVD on the GPU.

    Parameters
    ----------
    full_matrices : bool, optional
        If True (default), u and v have the shapes (M, M) and (N, N),
        respectively.
        Otherwise, the shapes are (M, K) and (K, N), respectively,
        where K = min(M, N).
    compute_uv : bool, optional
        Whether or not to compute u and v in addition to s.
        True by default.

    Returns
    -------
    U, S, VT : matrices

    """
    out = GpuMagmaSVD(full_matrices, compute_uv)(a)
    if compute_uv:
        S, U, VT = out
        out = [U, S, VT]
    return out

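# A minimal usage sketch for `gpu_svd`, kept as a comment so nothing runs at
# import time. It assumes a configured GPU context with MAGMA available;
# `A_val` is a hypothetical float32 numpy array of shape (M, N).
#
#     import numpy as np
#     import theano
#     from theano import tensor
#
#     A = tensor.matrix('A', dtype='float32')
#     U, S, VT = gpu_svd(A, full_matrices=False)
#     f = theano.function([A], [U, S, VT])
#     u, s, vt = [np.asarray(r) for r in f(A_val)]
#     # reconstruct A from the thin SVD: A ~= U diag(S) VT
#     assert np.allclose((u * s).dot(vt), A_val, atol=1e-3)
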

class GpuMagmaMatrixInverse(GpuMagmaBase):
    """Computes the inverse of a matrix :math:`A` using the MAGMA library.
    """
    __props__ = ('inplace', )
    check_input = False
    params_type = ParamsType(inplace=bool_t, context=gpu_context_type)

    def __init__(self, inplace=False):
        COp.__init__(self, ['c_code/magma_inv.c'], 'APPLY_SPECIFIC(magma_inv)')
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}

    def clone_inplace(self):
        return self.__class__(inplace=True)

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != 'float32':
            raise TypeError("only `float32` is supported for now")
        return theano.Apply(self, [A], [A.type()])

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def infer_shape(self, node, shapes):
        return shapes


def gpu_matrix_inverse(a):
    """
    This function computes the matrix inverse on the GPU.

    Returns
    -------
    a_inv : matrix

    """
    return GpuMagmaMatrixInverse()(a)

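# A minimal usage sketch for `gpu_matrix_inverse`, kept as a comment so nothing
# runs at import time. It assumes a configured GPU context with MAGMA
# available; `A_val` is a hypothetical invertible float32 numpy array.
#
#     import numpy as np
#     import theano
#     from theano import tensor
#
#     A = tensor.matrix('A', dtype='float32')
#     A_inv = gpu_matrix_inverse(A)
#     f = theano.function([A], A_inv)
#     A_inv_val = np.asarray(f(A_val))
#     assert np.allclose(A_val.dot(A_inv_val), np.eye(A_val.shape[0]), atol=1e-3)
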

class GpuMagmaCholesky(GpuMagmaBase, CGpuKernelBase):
    """Computes the Cholesky decomposition of a matrix :math:`A` using the
    MAGMA library.

    """
    __props__ = ('lower', 'inplace')
    check_input = False
    params_type = ParamsType(lower=bool_t, inplace=bool_t, context=gpu_context_type)

    def __init__(self, lower=True, inplace=False):
        self.lower = lower
        COp.__init__(self, ['c_code/magma_cholesky.c'], 'APPLY_SPECIFIC(magma_cholesky)')
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}

    def clone_inplace(self):
        return self.__class__(lower=self.lower, inplace=True)

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != 'float32':
            raise TypeError("only `float32` is supported for now")
        return theano.Apply(self, [A], [A.type()])

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def infer_shape(self, node, shapes):
        return [shapes[0]]

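# A minimal sketch of applying `GpuMagmaCholesky` directly, since no
# convenience wrapper is defined for it here. Kept as a comment so nothing runs
# at import time; it assumes a configured GPU context with MAGMA available and
# a hypothetical SPD float32 array `A_val`.
#
#     import theano
#     from theano import tensor
#
#     A = tensor.matrix('A', dtype='float32')
#     L = GpuMagmaCholesky(lower=True)(A)
#     f = theano.function([A], L)
#     L_val = f(A_val)
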

class GpuMagmaQR(GpuMagmaBase, CGpuKernelBase):
    """Computes the QR decomposition of a matrix :math:`A` using the MAGMA
    library.

    Parameters
    ----------
    complete
        If `False`, returns only ``R``.

    .. warning::

        Because of implementation constraints, this Op returns outputs
        in order ``R, Q``. Use :func:`theano.gpuarray.linalg.gpu_qr`
        to get them in the expected order ``Q, R``.
    """
    __props__ = ('complete', )
    _cop_num_inputs = 1
    _cop_num_outputs = 2
    check_input = False
    params_type = ParamsType(complete=bool_t, context=gpu_context_type)

    def __init__(self, complete=True):
        self.complete = complete
        COp.__init__(self, ['c_code/magma_qr.c'], 'APPLY_SPECIFIC(magma_qr)')

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != 'float32':
            raise TypeError("only `float32` is supported for now")
        if self.complete:
            return theano.Apply(self, [A],
                                # return R, Q
                                [A.type(), A.type()])
        else:
            return theano.Apply(self, [A],
                                # return R
                                [A.type()])

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)


def gpu_qr(a, complete=True):
    """
    This function performs the QR decomposition on the GPU.

    Parameters
    ----------
    complete : bool, optional
        If `False`, returns only ``R``.

    Returns
    -------
    Q, R : matrices
        If `complete` is `False`, only ``R`` is returned.

    """
    out = GpuMagmaQR(complete)(a)
    if complete:
        R, Q = out
        out = [Q, R]
    return out

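# A minimal usage sketch for `gpu_qr`, kept as a comment so nothing runs at
# import time. It assumes a configured GPU context with MAGMA available;
# `A_val` is a hypothetical float32 numpy array.
#
#     import numpy as np
#     import theano
#     from theano import tensor
#
#     A = tensor.matrix('A', dtype='float32')
#     Q, R = gpu_qr(A, complete=True)
#     f = theano.function([A], [Q, R])
#     q, r = [np.asarray(res) for res in f(A_val)]
#     assert np.allclose(q.dot(r), A_val, atol=1e-3)
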

class GpuMagmaEigh(GpuMagmaBase):
    """Computes the eigendecomposition of a symmetric matrix :math:`A` using
    the MAGMA library.

    Parameters
    ----------
    UPLO
        Specifies whether the calculation is done with the lower triangular
        part of the matrix (`'L'`, default) or the upper triangular part
        (`'U'`).
    compute_v
        If `True` (default), computes eigenvalues and eigenvectors. If
        `False`, computes only the eigenvalues of the matrix.
    """
    __props__ = ('lower', 'compute_v')
    _cop_num_inputs = 1
    _cop_num_outputs = 2
    check_input = False
    params_type = ParamsType(lower=bool_t, compute_v=bool_t,
                             context=gpu_context_type)

    def __init__(self, UPLO='L', compute_v=True):
        assert UPLO in ['L', 'U']
        self.lower = UPLO == 'L'
        self.compute_v = compute_v
        COp.__init__(self, ['c_code/magma_eigh.c'], 'APPLY_SPECIFIC(magma_eigh)')


    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != 'float32':
            raise TypeError("only `float32` is supported for now")
        if self.compute_v:
            return theano.Apply(self, [A],
                                # return D, V
                                [GpuArrayType(A.dtype, broadcastable=[False],
                                              context_name=ctx_name)(),
                                 A.type()])
        else:
            return theano.Apply(self, [A],
                                # return D
                                [GpuArrayType(A.dtype, broadcastable=[False],
                                              context_name=ctx_name)()])

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

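
# A minimal sketch of applying `GpuMagmaEigh` directly, since no convenience
# wrapper is defined for it here. Kept as a comment so nothing runs at import
# time; it assumes a configured GPU context with MAGMA available and a
# hypothetical symmetric float32 array `A_val`.
#
#     import theano
#     from theano import tensor
#
#     A = tensor.matrix('A', dtype='float32')
#     d, v = GpuMagmaEigh(UPLO='L', compute_v=True)(A)
#     f = theano.function([A], [d, v])
#     eigenvalues, eigenvectors = f(A_val)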