1
2 /*! @file csp_blas2.c
3 * \brief Sparse BLAS 2, using some dense BLAS 2 operations
4 *
5 * <pre>
6 * -- SuperLU routine (version 3.0) --
7 * Univ. of California Berkeley, Xerox Palo Alto Research Center,
8 * and Lawrence Berkeley National Lab.
9 * October 15, 2003
10 * </pre>
11 */
12 /*
13 * File name: csp_blas2.c
14 * Purpose: Sparse BLAS 2, using some dense BLAS 2 operations.
15 */
16
17 #include "slu_cdefs.h"
18
19 /*
20 * Function prototypes
21 */
22 void cusolve(int, int, complex*, complex*);
23 void clsolve(int, int, complex*, complex*);
24 void cmatvec(int, int, int, complex*, complex*, complex*);
25
26 /*! \brief Solves one of the systems of equations A*x = b, or A'*x = b
27 *
28 * <pre>
29 * Purpose
30 * =======
31 *
32 * sp_ctrsv() solves one of the systems of equations
33 * A*x = b, or A'*x = b,
34 * where b and x are n element vectors and A is a sparse unit , or
35 * non-unit, upper or lower triangular matrix.
36 * No test for singularity or near-singularity is included in this
37 * routine. Such tests must be performed before calling this routine.
38 *
39 * Parameters
40 * ==========
41 *
42 * uplo - (input) char*
43 * On entry, uplo specifies whether the matrix is an upper or
44 * lower triangular matrix as follows:
45 * uplo = 'U' or 'u' A is an upper triangular matrix.
46 * uplo = 'L' or 'l' A is a lower triangular matrix.
47 *
48 * trans - (input) char*
49 * On entry, trans specifies the equations to be solved as
50 * follows:
51 * trans = 'N' or 'n' A*x = b.
52 * trans = 'T' or 't' A'*x = b.
53 * trans = 'C' or 'c' A^H*x = b.
54 *
55 * diag - (input) char*
56 * On entry, diag specifies whether or not A is unit
57 * triangular as follows:
58 * diag = 'U' or 'u' A is assumed to be unit triangular.
59 * diag = 'N' or 'n' A is not assumed to be unit
60 * triangular.
61 *
62 * L - (input) SuperMatrix*
63 * The factor L from the factorization Pr*A*Pc=L*U. Use
64 * compressed row subscripts storage for supernodes,
65 * i.e., L has types: Stype = SC, Dtype = SLU_C, Mtype = TRLU.
66 *
67 * U - (input) SuperMatrix*
68 * The factor U from the factorization Pr*A*Pc=L*U.
69 * U has types: Stype = NC, Dtype = SLU_C, Mtype = TRU.
70 *
71 * x - (input/output) complex*
72 * Before entry, the incremented array X must contain the n
73 * element right-hand side vector b. On exit, X is overwritten
74 * with the solution vector x.
75 *
76 * info - (output) int*
77 * If *info = -i, the i-th argument had an illegal value.
78 * </pre>
79 */
80 int
sp_ctrsv(char * uplo,char * trans,char * diag,SuperMatrix * L,SuperMatrix * U,complex * x,SuperLUStat_t * stat,int * info)81 sp_ctrsv(char *uplo, char *trans, char *diag, SuperMatrix *L,
82 SuperMatrix *U, complex *x, SuperLUStat_t *stat, int *info)
83 {
84 #ifdef _CRAY
85 _fcd ftcs1 = _cptofcd("L", strlen("L")),
86 ftcs2 = _cptofcd("N", strlen("N")),
87 ftcs3 = _cptofcd("U", strlen("U"));
88 #endif
89 SCformat *Lstore;
90 NCformat *Ustore;
91 complex *Lval, *Uval;
92 int incx = 1, incy = 1;
93 complex temp;
94 complex alpha = {1.0, 0.0}, beta = {1.0, 0.0};
95 complex comp_zero = {0.0, 0.0};
96 int nrow;
97 int fsupc, nsupr, nsupc, luptr, istart, irow;
98 int i, k, iptr, jcol;
99 complex *work;
100 flops_t solve_ops;
101
102 /* Test the input parameters */
103 *info = 0;
104 if ( !lsame_(uplo,"L") && !lsame_(uplo, "U") ) *info = -1;
105 else if ( !lsame_(trans, "N") && !lsame_(trans, "T") &&
106 !lsame_(trans, "C")) *info = -2;
107 else if ( !lsame_(diag, "U") && !lsame_(diag, "N") ) *info = -3;
108 else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4;
109 else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5;
110 if ( *info ) {
111 i = -(*info);
112 xerbla_("sp_ctrsv", &i);
113 return 0;
114 }
115
116 Lstore = L->Store;
117 Lval = Lstore->nzval;
118 Ustore = U->Store;
119 Uval = Ustore->nzval;
120 solve_ops = 0;
121
122 if ( !(work = complexCalloc(L->nrow)) )
123 ABORT("Malloc fails for work in sp_ctrsv().");
124
125 if ( lsame_(trans, "N") ) { /* Form x := inv(A)*x. */
126
127 if ( lsame_(uplo, "L") ) {
128 /* Form x := inv(L)*x */
129 if ( L->nrow == 0 ) return 0; /* Quick return */
130
131 for (k = 0; k <= Lstore->nsuper; k++) {
132 fsupc = L_FST_SUPC(k);
133 istart = L_SUB_START(fsupc);
134 nsupr = L_SUB_START(fsupc+1) - istart;
135 nsupc = L_FST_SUPC(k+1) - fsupc;
136 luptr = L_NZ_START(fsupc);
137 nrow = nsupr - nsupc;
138
139 /* 1 c_div costs 10 flops */
140 solve_ops += 4 * nsupc * (nsupc - 1) + 10 * nsupc;
141 solve_ops += 8 * nrow * nsupc;
142
143 if ( nsupc == 1 ) {
144 for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) {
145 irow = L_SUB(iptr);
146 ++luptr;
147 cc_mult(&comp_zero, &x[fsupc], &Lval[luptr]);
148 c_sub(&x[irow], &x[irow], &comp_zero);
149 }
150 } else {
151 #ifdef USE_VENDOR_BLAS
152 #ifdef _CRAY
153 CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
154 &x[fsupc], &incx);
155
156 CGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc],
157 &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
158 #else
159 ctrsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr,
160 &x[fsupc], &incx);
161
162 cgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc],
163 &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
164 #endif
165 #else
166 clsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]);
167
168 cmatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc],
169 &x[fsupc], &work[0] );
170 #endif
171
172 iptr = istart + nsupc;
173 for (i = 0; i < nrow; ++i, ++iptr) {
174 irow = L_SUB(iptr);
175 c_sub(&x[irow], &x[irow], &work[i]); /* Scatter */
176 work[i] = comp_zero;
177
178 }
179 }
180 } /* for k ... */
181
182 } else {
183 /* Form x := inv(U)*x */
184
185 if ( U->nrow == 0 ) return 0; /* Quick return */
186
187 for (k = Lstore->nsuper; k >= 0; k--) {
188 fsupc = L_FST_SUPC(k);
189 nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
190 nsupc = L_FST_SUPC(k+1) - fsupc;
191 luptr = L_NZ_START(fsupc);
192
193 /* 1 c_div costs 10 flops */
194 solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc;
195
196 if ( nsupc == 1 ) {
197 c_div(&x[fsupc], &x[fsupc], &Lval[luptr]);
198 for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) {
199 irow = U_SUB(i);
200 cc_mult(&comp_zero, &x[fsupc], &Uval[i]);
201 c_sub(&x[irow], &x[irow], &comp_zero);
202 }
203 } else {
204 #ifdef USE_VENDOR_BLAS
205 #ifdef _CRAY
206 CTRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr,
207 &x[fsupc], &incx);
208 #else
209 ctrsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr,
210 &x[fsupc], &incx);
211 #endif
212 #else
213 cusolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] );
214 #endif
215
216 for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
217 solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
218 for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1);
219 i++) {
220 irow = U_SUB(i);
221 cc_mult(&comp_zero, &x[jcol], &Uval[i]);
222 c_sub(&x[irow], &x[irow], &comp_zero);
223 }
224 }
225 }
226 } /* for k ... */
227
228 }
229 } else if ( lsame_(trans, "T") ) { /* Form x := inv(A')*x */
230
231 if ( lsame_(uplo, "L") ) {
232 /* Form x := inv(L')*x */
233 if ( L->nrow == 0 ) return 0; /* Quick return */
234
235 for (k = Lstore->nsuper; k >= 0; --k) {
236 fsupc = L_FST_SUPC(k);
237 istart = L_SUB_START(fsupc);
238 nsupr = L_SUB_START(fsupc+1) - istart;
239 nsupc = L_FST_SUPC(k+1) - fsupc;
240 luptr = L_NZ_START(fsupc);
241
242 solve_ops += 8 * (nsupr - nsupc) * nsupc;
243
244 for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
245 iptr = istart + nsupc;
246 for (i = L_NZ_START(jcol) + nsupc;
247 i < L_NZ_START(jcol+1); i++) {
248 irow = L_SUB(iptr);
249 cc_mult(&comp_zero, &x[irow], &Lval[i]);
250 c_sub(&x[jcol], &x[jcol], &comp_zero);
251 iptr++;
252 }
253 }
254
255 if ( nsupc > 1 ) {
256 solve_ops += 4 * nsupc * (nsupc - 1);
257 #ifdef _CRAY
258 ftcs1 = _cptofcd("L", strlen("L"));
259 ftcs2 = _cptofcd("T", strlen("T"));
260 ftcs3 = _cptofcd("U", strlen("U"));
261 CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
262 &x[fsupc], &incx);
263 #else
264 ctrsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr,
265 &x[fsupc], &incx);
266 #endif
267 }
268 }
269 } else {
270 /* Form x := inv(U')*x */
271 if ( U->nrow == 0 ) return 0; /* Quick return */
272
273 for (k = 0; k <= Lstore->nsuper; k++) {
274 fsupc = L_FST_SUPC(k);
275 nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
276 nsupc = L_FST_SUPC(k+1) - fsupc;
277 luptr = L_NZ_START(fsupc);
278
279 for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
280 solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
281 for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) {
282 irow = U_SUB(i);
283 cc_mult(&comp_zero, &x[irow], &Uval[i]);
284 c_sub(&x[jcol], &x[jcol], &comp_zero);
285 }
286 }
287
288 /* 1 c_div costs 10 flops */
289 solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc;
290
291 if ( nsupc == 1 ) {
292 c_div(&x[fsupc], &x[fsupc], &Lval[luptr]);
293 } else {
294 #ifdef _CRAY
295 ftcs1 = _cptofcd("U", strlen("U"));
296 ftcs2 = _cptofcd("T", strlen("T"));
297 ftcs3 = _cptofcd("N", strlen("N"));
298 CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
299 &x[fsupc], &incx);
300 #else
301 ctrsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr,
302 &x[fsupc], &incx);
303 #endif
304 }
305 } /* for k ... */
306 }
307 } else { /* Form x := conj(inv(A'))*x */
308
309 if ( lsame_(uplo, "L") ) {
310 /* Form x := conj(inv(L'))*x */
311 if ( L->nrow == 0 ) return 0; /* Quick return */
312
313 for (k = Lstore->nsuper; k >= 0; --k) {
314 fsupc = L_FST_SUPC(k);
315 istart = L_SUB_START(fsupc);
316 nsupr = L_SUB_START(fsupc+1) - istart;
317 nsupc = L_FST_SUPC(k+1) - fsupc;
318 luptr = L_NZ_START(fsupc);
319
320 solve_ops += 8 * (nsupr - nsupc) * nsupc;
321
322 for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
323 iptr = istart + nsupc;
324 for (i = L_NZ_START(jcol) + nsupc;
325 i < L_NZ_START(jcol+1); i++) {
326 irow = L_SUB(iptr);
327 cc_conj(&temp, &Lval[i]);
328 cc_mult(&comp_zero, &x[irow], &temp);
329 c_sub(&x[jcol], &x[jcol], &comp_zero);
330 iptr++;
331 }
332 }
333
334 if ( nsupc > 1 ) {
335 solve_ops += 4 * nsupc * (nsupc - 1);
336 #ifdef _CRAY
337 ftcs1 = _cptofcd("L", strlen("L"));
338 ftcs2 = _cptofcd(trans, strlen("T"));
339 ftcs3 = _cptofcd("U", strlen("U"));
340 CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
341 &x[fsupc], &incx);
342 #else
343 ctrsv_("L", trans, "U", &nsupc, &Lval[luptr], &nsupr,
344 &x[fsupc], &incx);
345 #endif
346 }
347 }
348 } else {
349 /* Form x := conj(inv(U'))*x */
350 if ( U->nrow == 0 ) return 0; /* Quick return */
351
352 for (k = 0; k <= Lstore->nsuper; k++) {
353 fsupc = L_FST_SUPC(k);
354 nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
355 nsupc = L_FST_SUPC(k+1) - fsupc;
356 luptr = L_NZ_START(fsupc);
357
358 for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
359 solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
360 for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) {
361 irow = U_SUB(i);
362 cc_conj(&temp, &Uval[i]);
363 cc_mult(&comp_zero, &x[irow], &temp);
364 c_sub(&x[jcol], &x[jcol], &comp_zero);
365 }
366 }
367
368 /* 1 c_div costs 10 flops */
369 solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc;
370
371 if ( nsupc == 1 ) {
372 cc_conj(&temp, &Lval[luptr]);
373 c_div(&x[fsupc], &x[fsupc], &temp);
374 } else {
375 #ifdef _CRAY
376 ftcs1 = _cptofcd("U", strlen("U"));
377 ftcs2 = _cptofcd(trans, strlen("T"));
378 ftcs3 = _cptofcd("N", strlen("N"));
379 CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
380 &x[fsupc], &incx);
381 #else
382 ctrsv_("U", trans, "N", &nsupc, &Lval[luptr], &nsupr,
383 &x[fsupc], &incx);
384 #endif
385 }
386 } /* for k ... */
387 }
388 }
389
390 stat->ops[SOLVE] += solve_ops;
391 SUPERLU_FREE(work);
392 return 0;
393 }
394
395
396
397 /*! \brief Performs one of the matrix-vector operations y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y
398 *
399 * <pre>
400 * Purpose
401 * =======
402 *
403 * sp_cgemv() performs one of the matrix-vector operations
404 * y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y,
405 * where alpha and beta are scalars, x and y are vectors and A is a
406 * sparse A->nrow by A->ncol matrix.
407 *
408 * Parameters
409 * ==========
410 *
411 * TRANS - (input) char*
412 * On entry, TRANS specifies the operation to be performed as
413 * follows:
414 * TRANS = 'N' or 'n' y := alpha*A*x + beta*y.
415 * TRANS = 'T' or 't' y := alpha*A'*x + beta*y.
416 * TRANS = 'C' or 'c' y := alpha*A'*x + beta*y.
417 *
418 * ALPHA - (input) complex
419 * On entry, ALPHA specifies the scalar alpha.
420 *
421 * A - (input) SuperMatrix*
422 * Before entry, the leading m by n part of the array A must
423 * contain the matrix of coefficients.
424 *
425 * X - (input) complex*, array of DIMENSION at least
426 * ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'
427 * and at least
428 * ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.
429 * Before entry, the incremented array X must contain the
430 * vector x.
431 *
432 * INCX - (input) int
433 * On entry, INCX specifies the increment for the elements of
434 * X. INCX must not be zero.
435 *
436 * BETA - (input) complex
437 * On entry, BETA specifies the scalar beta. When BETA is
438 * supplied as zero then Y need not be set on input.
439 *
440 * Y - (output) complex*, array of DIMENSION at least
441 * ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'
442 * and at least
443 * ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.
444 * Before entry with BETA non-zero, the incremented array Y
445 * must contain the vector y. On exit, Y is overwritten by the
446 * updated vector y.
447 *
448 * INCY - (input) int
449 * On entry, INCY specifies the increment for the elements of
450 * Y. INCY must not be zero.
451 *
452 * ==== Sparse Level 2 Blas routine.
453 * </pre>
454 */
455 int
sp_cgemv(char * trans,complex alpha,SuperMatrix * A,complex * x,int incx,complex beta,complex * y,int incy)456 sp_cgemv(char *trans, complex alpha, SuperMatrix *A, complex *x,
457 int incx, complex beta, complex *y, int incy)
458 {
459
460 /* Local variables */
461 NCformat *Astore;
462 complex *Aval;
463 int info;
464 complex temp, temp1;
465 int lenx, leny, i, j, irow;
466 int iy, jx, jy, kx, ky;
467 int notran;
468 complex comp_zero = {0.0, 0.0};
469 complex comp_one = {1.0, 0.0};
470
471 notran = lsame_(trans, "N");
472 Astore = A->Store;
473 Aval = Astore->nzval;
474
475 /* Test the input parameters */
476 info = 0;
477 if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C")) info = 1;
478 else if ( A->nrow < 0 || A->ncol < 0 ) info = 3;
479 else if (incx == 0) info = 5;
480 else if (incy == 0) info = 8;
481 if (info != 0) {
482 xerbla_("sp_cgemv ", &info);
483 return 0;
484 }
485
486 /* Quick return if possible. */
487 if (A->nrow == 0 || A->ncol == 0 ||
488 c_eq(&alpha, &comp_zero) &&
489 c_eq(&beta, &comp_one))
490 return 0;
491
492
493 /* Set LENX and LENY, the lengths of the vectors x and y, and set
494 up the start points in X and Y. */
495 if (lsame_(trans, "N")) {
496 lenx = A->ncol;
497 leny = A->nrow;
498 } else {
499 lenx = A->nrow;
500 leny = A->ncol;
501 }
502 if (incx > 0) kx = 0;
503 else kx = - (lenx - 1) * incx;
504 if (incy > 0) ky = 0;
505 else ky = - (leny - 1) * incy;
506
507 /* Start the operations. In this version the elements of A are
508 accessed sequentially with one pass through A. */
509 /* First form y := beta*y. */
510 if ( !c_eq(&beta, &comp_one) ) {
511 if (incy == 1) {
512 if ( c_eq(&beta, &comp_zero) )
513 for (i = 0; i < leny; ++i) y[i] = comp_zero;
514 else
515 for (i = 0; i < leny; ++i)
516 cc_mult(&y[i], &beta, &y[i]);
517 } else {
518 iy = ky;
519 if ( c_eq(&beta, &comp_zero) )
520 for (i = 0; i < leny; ++i) {
521 y[iy] = comp_zero;
522 iy += incy;
523 }
524 else
525 for (i = 0; i < leny; ++i) {
526 cc_mult(&y[iy], &beta, &y[iy]);
527 iy += incy;
528 }
529 }
530 }
531
532 if ( c_eq(&alpha, &comp_zero) ) return 0;
533
534 if ( notran ) {
535 /* Form y := alpha*A*x + y. */
536 jx = kx;
537 if (incy == 1) {
538 for (j = 0; j < A->ncol; ++j) {
539 if ( !c_eq(&x[jx], &comp_zero) ) {
540 cc_mult(&temp, &alpha, &x[jx]);
541 for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
542 irow = Astore->rowind[i];
543 cc_mult(&temp1, &temp, &Aval[i]);
544 c_add(&y[irow], &y[irow], &temp1);
545 }
546 }
547 jx += incx;
548 }
549 } else {
550 ABORT("Not implemented.");
551 }
552 } else {
553 /* Form y := alpha*A'*x + y. */
554 jy = ky;
555 if (incx == 1) {
556 for (j = 0; j < A->ncol; ++j) {
557 temp = comp_zero;
558 for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
559 irow = Astore->rowind[i];
560 cc_mult(&temp1, &Aval[i], &x[irow]);
561 c_add(&temp, &temp, &temp1);
562 }
563 cc_mult(&temp1, &alpha, &temp);
564 c_add(&y[jy], &y[jy], &temp1);
565 jy += incy;
566 }
567 } else {
568 ABORT("Not implemented.");
569 }
570 }
571 return 0;
572 } /* sp_cgemv */
573
574