1 #section kernels
2
3 #kernel triu_kernel : size, size, size, *:
4 #include "cluda.h"
5
triu_kernel(const ga_size nthreads,const ga_size ncols,const ga_size a_off,GLOBAL_MEM DTYPE_INPUT_0 * a)6 KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols,
7 const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
8 a = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)a) + a_off);
9 // grid stride looping
10 for (ga_size index = GID_0 * LDIM_0 + LID_0; index < nthreads;
11 index += LDIM_0 * GDIM_0) {
12 const ga_size ix = index / ncols;
13 const ga_size iy = index % ncols;
14 if (ix > iy) {
15 a[index] = 0.0;
16 }
17 }
18 }
19
20 #section init_code
21
22 setup_ext_cuda();
23
24 #section support_code
25
pygpu_narrow(PyGpuArrayObject * src,size_t dim,size_t size)26 static PyGpuArrayObject *pygpu_narrow(PyGpuArrayObject *src, size_t dim,
27 size_t size) {
28 PyGpuArrayObject *src_view = pygpu_view(src, Py_None);
29 src_view->ga.dimensions[dim] = size;
30 GpuArray_fix_flags(&src_view->ga);
31 return pygpu_copy(src_view, GA_C_ORDER);
32 }
33
34 #section support_code_struct
35
APPLY_SPECIFIC(magma_qr)36 int APPLY_SPECIFIC(magma_qr)(PyGpuArrayObject *A_,
37 PyGpuArrayObject **R,
38 PyGpuArrayObject **Q, // may be NULL
39 PARAMS_TYPE* params) {
40 PyGpuArrayObject *A = NULL;
41 magma_int_t M, N, K, nb, ldwork;
42 size_t n2;
43 float *tau_data = NULL;
44 gpudata *work_data = NULL;
45 int res = -1, info;
46 A = A_;
47
48 if (A->ga.typecode != GA_FLOAT) {
49 PyErr_SetString(PyExc_TypeError,
50 "GpuMagmaQR: Unsupported data type");
51 return -1;
52 }
53
54 // This is early to match the exit() in the fail label.
55 cuda_enter(params->context->ctx);
56
57 if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
58 PyErr_SetString(PyExc_ValueError,
59 "GpuMagmaQR: requires data to be C-contiguous");
60 goto fail;
61 }
62 if (PyGpuArray_NDIM(A) != 2) {
63 PyErr_SetString(PyExc_ValueError, "GpuMagmaQR: matrix rank error");
64 goto fail;
65 }
66
67 A = pygpu_copy(A_, GA_F_ORDER);
68 if (A == NULL) {
69 PyErr_SetString(PyExc_RuntimeError,
70 "GpuMagmaQR: failed to change to column-major order");
71 goto fail;
72 }
73
74 // magma matrix qr
75 M = PyGpuArray_DIM(A, 0);
76 N = PyGpuArray_DIM(A, 1);
77 K = M < N ? M : N;
78
79 if (MAGMA_SUCCESS != magma_smalloc_pinned(&tau_data, N * N)) {
80 PyErr_SetString(PyExc_RuntimeError,
81 "GpuMagmaQR: failed to allocate working memory");
82 goto fail;
83 }
84
85 nb = magma_get_sgeqrf_nb(M, N);
86 ldwork = (2 * K + magma_roundup(N, 32)) * nb;
87 work_data = gpudata_alloc(params->context->ctx, ldwork * sizeof(float), NULL, 0, NULL);
88 if (work_data == NULL) {
89 PyErr_SetString(PyExc_RuntimeError,
90 "GpuMagmaQR: failed to allocate working memory");
91 goto fail;
92 }
93
94 // compute R
95 magma_sgeqrf2_gpu(M, N, (float *)PyGpuArray_DEV_DATA(A), M, tau_data, &info);
96 if (info != 0) {
97 PyErr_Format(
98 PyExc_RuntimeError,
99 "GpuMagmaQR: magma_sgeqrf2_gpu argument %d has an illegal value", -info);
100 goto fail;
101 }
102 *R = pygpu_narrow(A, 0, K);
103 if (*R == NULL) {
104 PyErr_SetString(PyExc_RuntimeError, "GpuMagmaQR: failed to narrow array");
105 goto fail;
106 }
107 n2 = K * N;
108 res = triu_kernel_scall(1, &n2, 0, n2, N, (*R)->ga.offset, (*R)->ga.data);
109 if (res != GA_NO_ERROR) {
110 PyErr_Format(PyExc_RuntimeError, "GpuMagmaQR: triu_kernel %s.",
111 GpuKernel_error(&k_triu_kernel, res));
112 goto fail;
113 }
114
115 if (params->complete) {
116 // compute Q
117 Py_XDECREF(A);
118 A = pygpu_copy(A_, GA_F_ORDER);
119 if (A == NULL) {
120 PyErr_SetString(PyExc_RuntimeError,
121 "GpuMagmaQR: failed to change to column-major order");
122 return -1;
123 }
124 magma_sgeqrf_gpu(M, N, (float *)PyGpuArray_DEV_DATA(A), M, tau_data,
125 *(float **)work_data, &info);
126 if (info != 0) {
127 PyErr_Format(
128 PyExc_RuntimeError,
129 "GpuMagmaQR: magma_sgeqrf_gpu argument %d has an illegal value", -info);
130 goto fail;
131 }
132
133 magma_sorgqr_gpu(M, K, K, (float *)PyGpuArray_DEV_DATA(A), M, tau_data,
134 *(float **)work_data, nb, &info);
135 if (info != 0) {
136 PyErr_Format(
137 PyExc_RuntimeError,
138 "GpuMagmaQR: magma_sorgqr_gpu argument %d has an illegal value", -info);
139 goto fail;
140 }
141 *Q = pygpu_narrow(A, 1, K);
142 if (*Q == NULL) {
143 PyErr_SetString(PyExc_RuntimeError, "GpuMagmaQR: failed to narrow array");
144 goto fail;
145 }
146 }
147 res = 0;
148 fail:
149 if (tau_data != NULL)
150 magma_free_pinned(tau_data);
151 if (work_data != NULL)
152 gpudata_release(work_data);
153 Py_XDECREF(A);
154 cuda_exit(params->context->ctx);
155 return res;
156 }
157