1 /* ========================================================================== */
2 /* === MatrixOps/t_cholmod_sdmult =========================================== */
3 /* ========================================================================== */
4
5 /* -----------------------------------------------------------------------------
6 * CHOLMOD/MatrixOps Module. Copyright (C) 2005-2006, Timothy A. Davis
7 * http://www.suitesparse.com
8 * -------------------------------------------------------------------------- */
9
10 /* Template routine for cholmod_sdmult */
11
12 #include "cholmod_template.h"
13
/* ADVANCE(x,z,d): step the dense-matrix pointers x (real part) and z
 * (imaginary part) forward by d entries, for whichever numeric mode this
 * template is being compiled in:
 *   REAL:    one double per entry; z is unused (the token is dropped).
 *   COMPLEX: real/imaginary parts interleaved in x, so 2*d doubles.
 *   ZOMPLEX: separate real (x) and imaginary (z) arrays, each advanced by d.
 */
#undef ADVANCE

#ifdef REAL
#define ADVANCE(x,z,d) x += d
#elif defined (COMPLEX)
#define ADVANCE(x,z,d) x += 2*d
#else
#define ADVANCE(x,z,d) x += d ; z += d
#endif
23
24 /* ========================================================================== */
25 /* === t_cholmod_sdmult ===================================================== */
26 /* ========================================================================== */
27
/* Computes Y = alpha*op(A)*X + beta*Y, where op(A) is A (transpose = 0) or
 * A' (transpose nonzero), A is sparse, and X and Y are dense.  This is the
 * template worker for cholmod_sdmult; it is instantiated once per numeric
 * mode (REAL, COMPLEX, ZOMPLEX) via the MULT/MULTADD/ASSIGN/CLEAR/... macros
 * from cholmod_template.h.  All error checking and workspace allocation is
 * assumed to have been done by the caller (the cholmod_sdmult wrapper).
 *
 * Strategy: columns of X are processed four at a time (loop unrolling).  A
 * prologue keyed on kcol % 4 handles the 1-, 2-, or 3-column remainder first,
 * so the main loops always consume exactly four columns per pass.
 */
static void TEMPLATE (cholmod_sdmult)
(
    /* ---- input ---- */
    cholmod_sparse *A,	/* sparse matrix to multiply */
    int transpose,	/* use A if 0, or A' otherwise */
    double alpha [2],	/* scale factor for A */
    double beta [2],	/* scale factor for Y */
    cholmod_dense *X,	/* dense matrix to multiply */
    /* ---- in/out --- */
    cholmod_dense *Y,	/* resulting dense matrix */
    /* -- workspace -- */
    double *W		/* size 4*nx if needed, twice that for c/zomplex case */
)
{

    /* yx/xx hold up to 4 unrolled accumulators; 8 doubles so the COMPLEX
     * case can store 4 interleaved (real,imag) pairs.  ax holds one entry
     * of A (2 doubles for the complex case). */
    double yx [8], xx [8], ax [2] ;
#ifdef ZOMPLEX
    /* ZOMPLEX keeps imaginary parts in separate arrays.  In the REAL and
     * COMPLEX instantiations these identifiers are never declared; the
     * macros simply drop the *z argument tokens, so they never appear in
     * the expanded code. */
    double yz [4], xz [4], az [1] ;
    double betaz [1], alphaz [1] ;
#endif

    double *Ax, *Az, *Xx, *Xz, *Yx, *Yz, *w, *Wz ;
    Int *Ap, *Ai, *Anz ;
    size_t nx, ny, dx, dy ;
    Int packed, nrow, ncol, j, k, p, pend, kcol, i ;

    /* ---------------------------------------------------------------------- */
    /* get inputs */
    /* ---------------------------------------------------------------------- */

#ifdef ZOMPLEX
    /* split the caller's (real,imag) pairs into the separate zomplex form */
    betaz [0] = beta [1] ;
    alphaz [0] = alpha [1] ;
#endif

    ny = transpose ? A->ncol : A->nrow ;	/* required length of Y */
    nx = transpose ? A->nrow : A->ncol ;	/* required length of X */

    /* NOTE(review): nrow is read here but not referenced below in this view;
     * only ncol drives the column loops. */
    nrow = A->nrow ;
    ncol = A->ncol ;

    Ap  = A->p ;	/* column pointers */
    Anz = A->nz ;	/* column counts, used only if A is unpacked */
    Ai  = A->i ;	/* row indices */
    Ax  = A->x ;	/* numeric values (real, or interleaved complex) */
    Az  = A->z ;	/* imaginary values (zomplex only) */
    packed = A->packed ;
    Xx = X->x ;
    Xz = X->z ;
    Yx = Y->x ;
    Yz = Y->z ;
    kcol = X->ncol ;	/* number of columns of X (and Y) to process */
    dy = Y->d ;		/* leading dimension of Y */
    dx = X->d ;		/* leading dimension of X */
    /* W is used only by the symmetric 4-column case below: w holds the real
     * parts, Wz the imaginary parts (Wz is referenced only in the ZOMPLEX
     * instantiation, which is why W must be twice as large there). */
    w = W ;
    Wz = W + 4*nx ;

    /* ---------------------------------------------------------------------- */
    /* Y = beta * Y */
    /* ---------------------------------------------------------------------- */

    if (ENTRY_IS_ZERO (beta, betaz, 0))
    {
	/* beta == 0: clear Y outright (avoids reading uninitialized Y) */
	for (k = 0 ; k < kcol ; k++)
	{
	    for (i = 0 ; i < ((Int) ny) ; i++)
	    {
		/* y [i] = 0. ; */
		CLEAR (Yx, Yz, i) ;
	    }
	    /* y += dy ; */
	    ADVANCE (Yx,Yz,dy) ;
	}
    }
    else if (!ENTRY_IS_ONE (beta, betaz, 0))
    {
	/* general beta: scale Y in place (beta == 1 needs no work) */
	for (k = 0 ; k < kcol ; k++)
	{
	    for (i = 0 ; i < ((Int) ny) ; i++)
	    {
		/* y [i] *= beta [0] ; */
		MULT (Yx,Yz,i, Yx,Yz,i, beta,betaz, 0) ;
	    }
	    /* y += dy ; */
	    ADVANCE (Yx,Yz,dy) ;
	}
    }

    if (ENTRY_IS_ZERO (alpha, alphaz, 0))
    {
	/* alpha == 0: Y = beta*Y is the whole result */
	/* nothing else to do */
	return ;
    }

    /* ---------------------------------------------------------------------- */
    /* Y += alpha * op(A) * X, where op(A)=A or A' */
    /* ---------------------------------------------------------------------- */

    /* rewind Y to its first column (the beta pass advanced Yx/Yz) */
    Yx = Y->x ;
    Yz = Y->z ;

    /* k counts columns of X already consumed; the % 4 prologues below
     * advance it so the main loops handle a multiple of 4 columns */
    k = 0 ;

    if (A->stype == 0)
    {

	if (transpose)
	{

	    /* -------------------------------------------------------------- */
	    /* Y += alpha * A' * x, unsymmetric case */
	    /* -------------------------------------------------------------- */

	    /* A' is applied column-by-column of A: entry A(i,j) contributes
	     * conj(A(i,j)) * x(i) to y(j).  Remainder columns first: */
	    if (kcol % 4 == 1)
	    {

		for (j = 0 ; j < ncol ; j++)
		{
		    /* yj = 0. ; */
		    CLEAR (yx, yz, 0) ;
		    p = Ap [j] ;
		    pend = (packed) ? (Ap [j+1]) : (p + Anz [j]) ;
		    for ( ; p < pend ; p++)
		    {
			/* yj += conj(Ax [p]) * x [Ai [p]] ; */
			i = Ai [p] ;
			ASSIGN_CONJ (ax,az,0, Ax,Az,p) ;
			MULTADD (yx,yz,0, ax,az,0, Xx,Xz,i) ;
		    }
		    /* y [j] += alpha [0] * yj ; */
		    MULTADD (Yx,Yz,j, alpha,alphaz,0, yx,yz,0) ;
		}
		/* y += dy ; */
		/* x += dx ; */
		ADVANCE (Yx,Yz,dy) ;
		ADVANCE (Xx,Xz,dx) ;
		k++ ;

	    }
	    else if (kcol % 4 == 2)
	    {

		/* two remainder columns, each A entry reused for both */
		for (j = 0 ; j < ncol ; j++)
		{
		    /* yj0 = 0. ; */
		    /* yj1 = 0. ; */
		    CLEAR (yx,yz,0) ;
		    CLEAR (yx,yz,1) ;

		    p = Ap [j] ;
		    pend = (packed) ? (Ap [j+1]) : (p + Anz [j]) ;
		    for ( ; p < pend ; p++)
		    {
			i = Ai [p] ;
			/* aij = conj (Ax [p]) ; */
			ASSIGN_CONJ (ax,az,0, Ax,Az,p) ;

			/* yj0 += aij * x [i   ] ; */
			/* yj1 += aij * x [i+dx] ; */
			MULTADD (yx,yz,0, ax,az,0, Xx,Xz,i) ;
			MULTADD (yx,yz,1, ax,az,0, Xx,Xz,i+dx) ;
		    }
		    /* y [j   ] += alpha [0] * yj0 ; */
		    /* y [j+dy] += alpha [0] * yj1 ; */
		    MULTADD (Yx,Yz,j, alpha,alphaz,0, yx,yz,0) ;
		    MULTADD (Yx,Yz,j+dy, alpha,alphaz,0, yx,yz,1) ;
		}
		/* y += 2*dy ; */
		/* x += 2*dx ; */
		ADVANCE (Yx,Yz,2*dy) ;
		ADVANCE (Xx,Xz,2*dx) ;
		k += 2 ;

	    }
	    else if (kcol % 4 == 3)
	    {

		/* three remainder columns */
		for (j = 0 ; j < ncol ; j++)
		{
		    /* yj0 = 0. ; */
		    /* yj1 = 0. ; */
		    /* yj2 = 0. ; */
		    CLEAR (yx,yz,0) ;
		    CLEAR (yx,yz,1) ;
		    CLEAR (yx,yz,2) ;

		    p = Ap [j] ;
		    pend = (packed) ? (Ap [j+1]) : (p + Anz [j]) ;
		    for ( ; p < pend ; p++)
		    {
			i = Ai [p] ;
			/* aij = conj (Ax [p]) ; */
			ASSIGN_CONJ (ax,az,0, Ax,Az,p) ;

			/* yj0 += aij * x [i     ] ; */
			/* yj1 += aij * x [i+  dx] ; */
			/* yj2 += aij * x [i+2*dx] ; */
			MULTADD (yx,yz,0, ax,az,0, Xx,Xz,i) ;
			MULTADD (yx,yz,1, ax,az,0, Xx,Xz,i+dx) ;
			MULTADD (yx,yz,2, ax,az,0, Xx,Xz,i+2*dx) ;
		    }
		    /* y [j     ] += alpha [0] * yj0 ; */
		    /* y [j+  dy] += alpha [0] * yj1 ; */
		    /* y [j+2*dy] += alpha [0] * yj2 ; */
		    MULTADD (Yx,Yz,j, alpha,alphaz,0, yx,yz,0) ;
		    MULTADD (Yx,Yz,j+dy, alpha,alphaz,0, yx,yz,1) ;
		    MULTADD (Yx,Yz,j+2*dy, alpha,alphaz,0, yx,yz,2) ;
		}
		/* y += 3*dy ; */
		/* x += 3*dx ; */
		ADVANCE (Yx,Yz,3*dy) ;
		ADVANCE (Xx,Xz,3*dx) ;
		k += 3 ;
	    }

	    /* main loop: four columns of X per pass, sharing each A entry */
	    for ( ; k < kcol ; k += 4)
	    {
		for (j = 0 ; j < ncol ; j++)
		{
		    /* yj0 = 0. ; */
		    /* yj1 = 0. ; */
		    /* yj2 = 0. ; */
		    /* yj3 = 0. ; */
		    CLEAR (yx,yz,0) ;
		    CLEAR (yx,yz,1) ;
		    CLEAR (yx,yz,2) ;
		    CLEAR (yx,yz,3) ;

		    p = Ap [j] ;
		    pend = (packed) ? (Ap [j+1]) : (p + Anz [j]) ;
		    for ( ; p < pend ; p++)
		    {
			i = Ai [p] ;
			/* aij = conj(Ax [p]) ; */
			ASSIGN_CONJ (ax,az,0, Ax,Az,p) ;

			/* yj0 += aij * x [i     ] ; */
			/* yj1 += aij * x [i+  dx] ; */
			/* yj2 += aij * x [i+2*dx] ; */
			/* yj3 += aij * x [i+3*dx] ; */
			MULTADD (yx,yz,0, ax,az,0, Xx,Xz,i) ;
			MULTADD (yx,yz,1, ax,az,0, Xx,Xz,i+dx) ;
			MULTADD (yx,yz,2, ax,az,0, Xx,Xz,i+2*dx) ;
			MULTADD (yx,yz,3, ax,az,0, Xx,Xz,i+3*dx) ;

		    }
		    /* y [j     ] += alpha [0] * yj0 ; */
		    /* y [j+  dy] += alpha [0] * yj1 ; */
		    /* y [j+2*dy] += alpha [0] * yj2 ; */
		    /* y [j+3*dy] += alpha [0] * yj3 ; */
		    MULTADD (Yx,Yz,j, alpha,alphaz,0, yx,yz,0) ;
		    MULTADD (Yx,Yz,j+dy, alpha,alphaz,0, yx,yz,1) ;
		    MULTADD (Yx,Yz,j+2*dy, alpha,alphaz,0, yx,yz,2) ;
		    MULTADD (Yx,Yz,j+3*dy, alpha,alphaz,0, yx,yz,3) ;
		}
		/* y += 4*dy ; */
		/* x += 4*dx ; */
		ADVANCE (Yx,Yz,4*dy) ;
		ADVANCE (Xx,Xz,4*dx) ;
	    }

	}
	else
	{

	    /* -------------------------------------------------------------- */
	    /* Y += alpha * A * x, unsymmetric case */
	    /* -------------------------------------------------------------- */

	    /* Column-major saxpy form: column j of A scaled by alpha*x(j)
	     * is scattered into y.  Remainder columns of X first: */
	    if (kcol % 4 == 1)
	    {

		for (j = 0 ; j < ncol ; j++)
		{
		    /* xj = alpha [0] * x [j] ; */
		    MULT (xx,xz,0, alpha,alphaz,0, Xx,Xz,j) ;

		    p = Ap [j] ;
		    pend = (packed) ? (Ap [j+1]) : (p + Anz [j]) ;
		    for ( ; p < pend ; p++)
		    {
			/* y [Ai [p]] += Ax [p] * xj ; */
			i = Ai [p] ;
			MULTADD (Yx,Yz,i, Ax,Az,p, xx,xz,0) ;
		    }
		}
		/* y += dy ; */
		/* x += dx ; */
		ADVANCE (Yx,Yz,dy) ;
		ADVANCE (Xx,Xz,dx) ;
		k++ ;

	    }
	    else if (kcol % 4 == 2)
	    {

		for (j = 0 ; j < ncol ; j++)
		{
		    /* xj0 = alpha [0] * x [j   ] ; */
		    /* xj1 = alpha [0] * x [j+dx] ; */
		    MULT (xx,xz,0, alpha,alphaz,0, Xx,Xz,j) ;
		    MULT (xx,xz,1, alpha,alphaz,0, Xx,Xz,j+dx) ;

		    p = Ap [j] ;
		    pend = (packed) ? (Ap [j+1]) : (p + Anz [j]) ;
		    for ( ; p < pend ; p++)
		    {
			i = Ai [p] ;
			/* aij = Ax [p] ; */
			ASSIGN (ax,az,0, Ax,Az,p) ;

			/* y [i   ] += aij * xj0 ; */
			/* y [i+dy] += aij * xj1 ; */
			MULTADD (Yx,Yz,i, ax,az,0, xx,xz,0) ;
			MULTADD (Yx,Yz,i+dy, ax,az,0, xx,xz,1) ;
		    }
		}
		/* y += 2*dy ; */
		/* x += 2*dx ; */
		ADVANCE (Yx,Yz,2*dy) ;
		ADVANCE (Xx,Xz,2*dx) ;
		k += 2 ;

	    }
	    else if (kcol % 4 == 3)
	    {

		for (j = 0 ; j < ncol ; j++)
		{
		    /* xj0 = alpha [0] * x [j     ] ; */
		    /* xj1 = alpha [0] * x [j+  dx] ; */
		    /* xj2 = alpha [0] * x [j+2*dx] ; */
		    MULT (xx,xz,0, alpha,alphaz,0, Xx,Xz,j) ;
		    MULT (xx,xz,1, alpha,alphaz,0, Xx,Xz,j+dx) ;
		    MULT (xx,xz,2, alpha,alphaz,0, Xx,Xz,j+2*dx) ;

		    p = Ap [j] ;
		    pend = (packed) ? (Ap [j+1]) : (p + Anz [j]) ;
		    for ( ; p < pend ; p++)
		    {
			i = Ai [p] ;
			/* aij = Ax [p] ; */
			ASSIGN (ax,az,0, Ax,Az,p) ;

			/* y [i     ] += aij * xj0 ; */
			/* y [i+  dy] += aij * xj1 ; */
			/* y [i+2*dy] += aij * xj2 ; */
			MULTADD (Yx,Yz,i, ax,az,0, xx,xz,0) ;
			MULTADD (Yx,Yz,i+dy, ax,az,0, xx,xz,1) ;
			MULTADD (Yx,Yz,i+2*dy, ax,az,0, xx,xz,2) ;
		    }
		}
		/* y += 3*dy ; */
		/* x += 3*dx ; */
		ADVANCE (Yx,Yz,3*dy) ;
		ADVANCE (Xx,Xz,3*dx) ;
		k += 3 ;
	    }

	    /* main loop: scatter each column of A into 4 columns of Y */
	    for ( ; k < kcol ; k += 4)
	    {
		for (j = 0 ; j < ncol ; j++)
		{
		    /* xj0 = alpha [0] * x [j     ] ; */
		    /* xj1 = alpha [0] * x [j+  dx] ; */
		    /* xj2 = alpha [0] * x [j+2*dx] ; */
		    /* xj3 = alpha [0] * x [j+3*dx] ; */
		    MULT (xx,xz,0, alpha,alphaz,0, Xx,Xz,j) ;
		    MULT (xx,xz,1, alpha,alphaz,0, Xx,Xz,j+dx) ;
		    MULT (xx,xz,2, alpha,alphaz,0, Xx,Xz,j+2*dx) ;
		    MULT (xx,xz,3, alpha,alphaz,0, Xx,Xz,j+3*dx) ;

		    p = Ap [j] ;
		    pend = (packed) ? (Ap [j+1]) : (p + Anz [j]) ;
		    for ( ; p < pend ; p++)
		    {
			i = Ai [p] ;
			/* aij = Ax [p] ; */
			ASSIGN (ax,az,0, Ax,Az,p) ;

			/* y [i     ] += aij * xj0 ; */
			/* y [i+  dy] += aij * xj1 ; */
			/* y [i+2*dy] += aij * xj2 ; */
			/* y [i+3*dy] += aij * xj3 ; */
			MULTADD (Yx,Yz,i, ax,az,0, xx,xz,0) ;
			MULTADD (Yx,Yz,i+dy, ax,az,0, xx,xz,1) ;
			MULTADD (Yx,Yz,i+2*dy, ax,az,0, xx,xz,2) ;
			MULTADD (Yx,Yz,i+3*dy, ax,az,0, xx,xz,3) ;
		    }
		}
		/* y += 4*dy ; */
		/* x += 4*dx ; */
		ADVANCE (Yx,Yz,4*dy) ;
		ADVANCE (Xx,Xz,4*dx) ;
	    }
	}

    }
    else
    {

	/* ------------------------------------------------------------------ */
	/* Y += alpha * (A or A') * x, symmetric case (upper/lower) */
	/* ------------------------------------------------------------------ */

	/* Only the upper/lower triangular part and the diagonal of A is used.
	 * Since both x and y are written to in the innermost loop, this
	 * code can experience cache bank conflicts if x is used directly.
	 * Thus, a copy is made of x, four columns at a time, if x has
	 * four or more columns.
	 *
	 * Each stored off-diagonal entry A(i,j) is applied twice: once as
	 * A(i,j)*x(j) into y(i), and once (conjugated, via MULTADDCONJ) as
	 * its transposed image into y(j).  Diagonal entries are applied
	 * once.  Entries on the wrong side of the diagonal for this stype
	 * are skipped (the else-if condition below).  transpose is ignored
	 * here: for symmetric/Hermitian A, op(A) and A act identically. */

	if (kcol % 4 == 1)
	{

	    for (j = 0 ; j < ncol ; j++)
	    {
		/* yj = 0. ; */
		CLEAR (yx,yz,0) ;

		/* xj = alpha [0] * x [j] ; */
		MULT (xx,xz,0, alpha,alphaz,0, Xx,Xz,j) ;

		p = Ap [j] ;
		pend = (packed) ? (Ap [j+1]) : (p + Anz [j]) ;
		for ( ; p < pend ; p++)
		{
		    i = Ai [p] ;
		    if (i == j)
		    {
			/* diagonal entry: applied once */
			/* y [i] += Ax [p] * xj ; */
			MULTADD (Yx,Yz,i, Ax,Az,p, xx,xz,0) ;
		    }
		    else if ((A->stype > 0 && i < j) || (A->stype < 0 && i > j))
		    {
			/* off-diagonal entry in the stored triangle:
			 * applied as both A(i,j) and conj(A(i,j))' */
			/* aij = Ax [p] ; */
			ASSIGN (ax,az,0, Ax,Az,p) ;

			/* y [i] += aij * xj ; */
			/* yj += aij * x [i] ; */
			MULTADD (Yx,Yz,i, ax,az,0, xx,xz,0) ;
			MULTADDCONJ (yx,yz,0, ax,az,0, Xx,Xz,i) ;
		    }
		}
		/* y [j] += alpha [0] * yj ; */
		MULTADD (Yx,Yz,j, alpha,alphaz,0, yx,yz,0) ;

	    }
	    /* y += dy ; */
	    /* x += dx ; */
	    ADVANCE (Yx,Yz,dy) ;
	    ADVANCE (Xx,Xz,dx) ;
	    k++ ;

	}
	else if (kcol % 4 == 2)
	{

	    for (j = 0 ; j < ncol ; j++)
	    {
		/* yj0 = 0. ; */
		/* yj1 = 0. ; */
		CLEAR (yx,yz,0) ;
		CLEAR (yx,yz,1) ;

		/* xj0 = alpha [0] * x [j   ] ; */
		/* xj1 = alpha [0] * x [j+dx] ; */
		MULT (xx,xz,0, alpha,alphaz,0, Xx,Xz,j) ;
		MULT (xx,xz,1, alpha,alphaz,0, Xx,Xz,j+dx) ;

		p = Ap [j] ;
		pend = (packed) ? (Ap [j+1]) : (p + Anz [j]) ;
		for ( ; p < pend ; p++)
		{
		    i = Ai [p] ;
		    if (i == j)
		    {
			/* aij = Ax [p] ; */
			ASSIGN (ax,az,0, Ax,Az,p) ;

			/* y [i   ] += aij * xj0 ; */
			/* y [i+dy] += aij * xj1 ; */
			MULTADD (Yx,Yz,i, ax,az,0, xx,xz,0) ;
			MULTADD (Yx,Yz,i+dy, ax,az,0, xx,xz,1) ;

		    }
		    else if ((A->stype > 0 && i < j) || (A->stype < 0 && i > j))
		    {
			/* aij = Ax [p] ; */
			ASSIGN (ax,az,0, Ax,Az,p) ;

			/* y [i   ] += aij * xj0 ; */
			/* y [i+dy] += aij * xj1 ; */
			/* yj0 += aij * x [i   ] ; */
			/* yj1 += aij * x [i+dx] ; */
			MULTADD (Yx,Yz,i, ax,az,0, xx,xz,0) ;
			MULTADD (Yx,Yz,i+dy, ax,az,0, xx,xz,1) ;
			MULTADDCONJ (yx,yz,0, ax,az,0, Xx,Xz,i) ;
			MULTADDCONJ (yx,yz,1, ax,az,0, Xx,Xz,i+dx) ;

		    }
		}
		/* y [j   ] += alpha [0] * yj0 ; */
		/* y [j+dy] += alpha [0] * yj1 ; */
		MULTADD (Yx,Yz,j, alpha,alphaz,0, yx,yz,0) ;
		MULTADD (Yx,Yz,j+dy, alpha,alphaz,0, yx,yz,1) ;

	    }
	    /* y += 2*dy ; */
	    /* x += 2*dx ; */
	    ADVANCE (Yx,Yz,2*dy) ;
	    ADVANCE (Xx,Xz,2*dx) ;
	    k += 2 ;

	}
	else if (kcol % 4 == 3)
	{

	    for (j = 0 ; j < ncol ; j++)
	    {
		/* yj0 = 0. ; */
		/* yj1 = 0. ; */
		/* yj2 = 0. ; */
		CLEAR (yx,yz,0) ;
		CLEAR (yx,yz,1) ;
		CLEAR (yx,yz,2) ;

		/* xj0 = alpha [0] * x [j     ] ; */
		/* xj1 = alpha [0] * x [j+  dx] ; */
		/* xj2 = alpha [0] * x [j+2*dx] ; */
		MULT (xx,xz,0, alpha,alphaz,0, Xx,Xz,j) ;
		MULT (xx,xz,1, alpha,alphaz,0, Xx,Xz,j+dx) ;
		MULT (xx,xz,2, alpha,alphaz,0, Xx,Xz,j+2*dx) ;

		p = Ap [j] ;
		pend = (packed) ? (Ap [j+1]) : (p + Anz [j]) ;
		for ( ; p < pend ; p++)
		{
		    i = Ai [p] ;
		    if (i == j)
		    {

			/* aij = Ax [p] ; */
			ASSIGN (ax,az,0, Ax,Az,p) ;

			/* y [i     ] += aij * xj0 ; */
			/* y [i+  dy] += aij * xj1 ; */
			/* y [i+2*dy] += aij * xj2 ; */
			MULTADD (Yx,Yz,i, ax,az,0, xx,xz,0) ;
			MULTADD (Yx,Yz,i+dy, ax,az,0, xx,xz,1) ;
			MULTADD (Yx,Yz,i+2*dy, ax,az,0, xx,xz,2) ;

		    }
		    else if ((A->stype > 0 && i < j) || (A->stype < 0 && i > j))
		    {

			/* aij = Ax [p] ; */
			ASSIGN (ax,az,0, Ax,Az,p) ;

			/* y [i     ] += aij * xj0 ; */
			/* y [i+  dy] += aij * xj1 ; */
			/* y [i+2*dy] += aij * xj2 ; */
			/* yj0 += aij * x [i     ] ; */
			/* yj1 += aij * x [i+  dx] ; */
			/* yj2 += aij * x [i+2*dx] ; */
			MULTADD (Yx,Yz,i, ax,az,0, xx,xz,0) ;
			MULTADD (Yx,Yz,i+dy, ax,az,0, xx,xz,1) ;
			MULTADD (Yx,Yz,i+2*dy, ax,az,0, xx,xz,2) ;
			MULTADDCONJ (yx,yz,0, ax,az,0, Xx,Xz,i) ;
			MULTADDCONJ (yx,yz,1, ax,az,0, Xx,Xz,i+dx) ;
			MULTADDCONJ (yx,yz,2, ax,az,0, Xx,Xz,i+2*dx) ;

		    }
		}
		/* y [j     ] += alpha [0] * yj0 ; */
		/* y [j+  dy] += alpha [0] * yj1 ; */
		/* y [j+2*dy] += alpha [0] * yj2 ; */
		MULTADD (Yx,Yz,j, alpha,alphaz,0, yx,yz,0) ;
		MULTADD (Yx,Yz,j+dy, alpha,alphaz,0, yx,yz,1) ;
		MULTADD (Yx,Yz,j+2*dy, alpha,alphaz,0, yx,yz,2) ;

	    }
	    /* y += 3*dy ; */
	    /* x += 3*dx ; */
	    ADVANCE (Yx,Yz,3*dy) ;
	    ADVANCE (Xx,Xz,3*dx) ;

	    k += 3 ;
	}

	/* copy four columns of X into W, and put in row form */
	/* (row-interleaved: entries j of the four columns are adjacent at
	 * w [4*j .. 4*j+3], so the inner loop touches one cache line) */
	for ( ; k < kcol ; k += 4)
	{

	    for (j = 0 ; j < ncol ; j++)
	    {
		/* w [4*j  ] = x [j     ] ; */
		/* w [4*j+1] = x [j+  dx] ; */
		/* w [4*j+2] = x [j+2*dx] ; */
		/* w [4*j+3] = x [j+3*dx] ; */
		ASSIGN (w,Wz,4*j  , Xx,Xz,j     ) ;
		ASSIGN (w,Wz,4*j+1, Xx,Xz,j+dx  ) ;
		ASSIGN (w,Wz,4*j+2, Xx,Xz,j+2*dx) ;
		ASSIGN (w,Wz,4*j+3, Xx,Xz,j+3*dx) ;
	    }

	    for (j = 0 ; j < ncol ; j++)
	    {
		/* yj0 = 0. ; */
		/* yj1 = 0. ; */
		/* yj2 = 0. ; */
		/* yj3 = 0. ; */
		CLEAR (yx,yz,0) ;
		CLEAR (yx,yz,1) ;
		CLEAR (yx,yz,2) ;
		CLEAR (yx,yz,3) ;

		/* xj0 = alpha [0] * w [4*j  ] ; */
		/* xj1 = alpha [0] * w [4*j+1] ; */
		/* xj2 = alpha [0] * w [4*j+2] ; */
		/* xj3 = alpha [0] * w [4*j+3] ; */
		MULT (xx,xz,0, alpha,alphaz,0, w,Wz,4*j) ;
		MULT (xx,xz,1, alpha,alphaz,0, w,Wz,4*j+1) ;
		MULT (xx,xz,2, alpha,alphaz,0, w,Wz,4*j+2) ;
		MULT (xx,xz,3, alpha,alphaz,0, w,Wz,4*j+3) ;

		p = Ap [j] ;
		pend = (packed) ? (Ap [j+1]) : (p + Anz [j]) ;
		for ( ; p < pend ; p++)
		{
		    i = Ai [p] ;
		    if (i == j)
		    {
			/* aij = Ax [p] ; */
			ASSIGN (ax,az,0, Ax,Az,p) ;

			/* y [i     ] += aij * xj0 ; */
			/* y [i+  dy] += aij * xj1 ; */
			/* y [i+2*dy] += aij * xj2 ; */
			/* y [i+3*dy] += aij * xj3 ; */
			MULTADD (Yx,Yz,i     , ax,az,0, xx,xz,0) ;
			MULTADD (Yx,Yz,i+dy  , ax,az,0, xx,xz,1) ;
			MULTADD (Yx,Yz,i+2*dy, ax,az,0, xx,xz,2) ;
			MULTADD (Yx,Yz,i+3*dy, ax,az,0, xx,xz,3) ;

		    }
		    else if ((A->stype > 0 && i < j) || (A->stype < 0 && i > j))
		    {
			/* aij = Ax [p] ; */
			ASSIGN (ax,az,0, Ax,Az,p) ;

			/* y [i     ] += aij * xj0 ; */
			/* y [i+  dy] += aij * xj1 ; */
			/* y [i+2*dy] += aij * xj2 ; */
			/* y [i+3*dy] += aij * xj3 ; */
			/* yj0 += aij * w [4*i  ] ; */
			/* yj1 += aij * w [4*i+1] ; */
			/* yj2 += aij * w [4*i+2] ; */
			/* yj3 += aij * w [4*i+3] ; */
			MULTADD (Yx,Yz,i, ax,az,0, xx,xz,0) ;
			MULTADD (Yx,Yz,i+dy, ax,az,0, xx,xz,1) ;
			MULTADD (Yx,Yz,i+2*dy, ax,az,0, xx,xz,2) ;
			MULTADD (Yx,Yz,i+3*dy, ax,az,0, xx,xz,3) ;
			MULTADDCONJ (yx,yz,0, ax,az,0, w,Wz,4*i) ;
			MULTADDCONJ (yx,yz,1, ax,az,0, w,Wz,4*i+1) ;
			MULTADDCONJ (yx,yz,2, ax,az,0, w,Wz,4*i+2) ;
			MULTADDCONJ (yx,yz,3, ax,az,0, w,Wz,4*i+3) ;

		    }
		}
		/* y [j     ] += alpha [0] * yj0 ; */
		/* y [j+  dy] += alpha [0] * yj1 ; */
		/* y [j+2*dy] += alpha [0] * yj2 ; */
		/* y [j+3*dy] += alpha [0] * yj3 ; */
		MULTADD (Yx,Yz,j     , alpha,alphaz,0, yx,yz,0) ;
		MULTADD (Yx,Yz,j+dy  , alpha,alphaz,0, yx,yz,1) ;
		MULTADD (Yx,Yz,j+2*dy, alpha,alphaz,0, yx,yz,2) ;
		MULTADD (Yx,Yz,j+3*dy, alpha,alphaz,0, yx,yz,3) ;

	    }
	    /* y += 4*dy ; */
	    /* x += 4*dx ; */
	    ADVANCE (Yx,Yz,4*dy) ;
	    ADVANCE (Xx,Xz,4*dx) ;

	}
    }
}
718
719
720 #undef PATTERN
721 #undef REAL
722 #undef COMPLEX
723 #undef ZOMPLEX
724