1 /* Copyright (C) 2005 The Scalable Software Infrastructure Project. All rights reserved.
2
3 Redistribution and use in source and binary forms, with or without
4 modification, are permitted provided that the following conditions are met:
5 1. Redistributions of source code must retain the above copyright
6 notice, this list of conditions and the following disclaimer.
7 2. Redistributions in binary form must reproduce the above copyright
8 notice, this list of conditions and the following disclaimer in the
9 documentation and/or other materials provided with the distribution.
10 3. Neither the name of the project nor the names of its contributors
11 may be used to endorse or promote products derived from this software
12 without specific prior written permission.
13
14 THIS SOFTWARE IS PROVIDED BY THE SCALABLE SOFTWARE INFRASTRUCTURE PROJECT
15 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
16 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE SCALABLE SOFTWARE INFRASTRUCTURE
18 PROJECT BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
19 OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
20 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
21 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
22 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
24 POSSIBILITY OF SUCH DAMAGE.
25 */
26
27 #ifdef HAVE_CONFIG_H
28 #include "lis_config.h"
29 #else
30 #ifdef HAVE_CONFIG_WIN_H
31 #include "lis_config_win.h"
32 #endif
33 #endif
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #ifdef HAVE_MALLOC_H
38 #include <malloc.h>
39 #endif
40 #include <string.h>
41 #include <stdarg.h>
42 #ifdef _OPENMP
43 #include <omp.h>
44 #endif
45 #ifdef USE_MPI
46 #include <mpi.h>
47 #endif
48 #include "lislib.h"
49
50 LIS_MATVEC_XXX lis_matvec_bsc_xxx[4][4] = {
51 {lis_matvec_bsc_1x1, lis_matvec_bsc_1x2, lis_matvec_bsc_1x3, lis_matvec_bsc_1x4},
52 {lis_matvec_bsc_2x1, lis_matvec_bsc_2x2, lis_matvec_bsc_2x3, lis_matvec_bsc_2x4},
53 {lis_matvec_bsc_3x1, lis_matvec_bsc_3x2, lis_matvec_bsc_3x3, lis_matvec_bsc_3x4},
54 {lis_matvec_bsc_4x1, lis_matvec_bsc_4x2, lis_matvec_bsc_4x3, lis_matvec_bsc_4x4}
55 };
56
lis_matvec_bsc(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])57 void lis_matvec_bsc(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
58 {
59 LIS_INT i,j,k;
60 LIS_INT bi,bj,bc,bs;
61 LIS_INT nr,nc,bnr,bnc;
62 LIS_INT n;
63 LIS_SCALAR t;
64
65 n = A->n;
66 nr = A->nr;
67 nc = A->nc;
68 bnr = A->bnr;
69 bnc = A->bnc;
70 bs = bnr*bnc;
71 if( A->is_splited )
72 {
73 #ifdef _OPENMP
74 #pragma omp parallel for private(bi,i,j,t,k)
75 #endif
76 for(bi=0;bi<nr;bi++)
77 {
78 k = 0;
79 for(i=0;i<bnr;i++)
80 {
81 t = 0.0;
82 for(j=0;j<bnc;j++)
83 {
84 t += A->D->value[bi*bs+ j*bnr + i] * x[bi*bnr+j];
85 k++;
86 }
87 y[bi*bnr+i] = t;
88 }
89 }
90 #ifdef _OPENMP
91 #pragma omp parallel for private(i,j,k,bi,bj,bc)
92 #endif
93 for(bi=0;bi<nc;bi++)
94 {
95 for(bc=A->L->bptr[bi];bc<A->L->bptr[bi+1];bc++)
96 {
97 bj = A->L->bindex[bc] * bnr;
98 k = bc*bs;
99 for(j=0;j<bnc;j++)
100 {
101 for(i=0;i<bnr;i++)
102 {
103 y[bj+i] += A->L->value[k] * x[bi*bnc+j];
104 k++;
105 }
106 }
107 }
108 for(bc=A->U->bptr[bi];bc<A->U->bptr[bi+1];bc++)
109 {
110 bj = A->U->bindex[bc] * bnr;
111 k = bc*bs;
112 for(j=0;j<bnc;j++)
113 {
114 for(i=0;i<bnr;i++)
115 {
116 y[bj+i] += A->U->value[k] * x[bi*bnc+j];
117 k++;
118 }
119 }
120 }
121 }
122 }
123 else
124 {
125 #ifdef _OPENMP
126 #pragma omp parallel for private(i)
127 #endif
128 for(i=0; i<n; i++)
129 {
130 y[i] = 0.0;
131 }
132 #ifdef _OPENMP
133 #pragma omp parallel for private(i,j,k,bi,bj,bc)
134 #endif
135 for(bi=0;bi<nc;bi++)
136 {
137 for(bc=A->bptr[bi];bc<A->bptr[bi+1];bc++)
138 {
139 bj = A->bindex[bc] * bnr;
140 k = bc*bs;
141 for(j=0;j<bnc;j++)
142 {
143 for(i=0;i<bnr;i++)
144 {
145 y[bj+i] += A->value[k] * x[bi*bnc+j];
146 k++;
147 }
148 }
149 }
150 }
151 }
152 }
153
lis_matvec_bsc_1x1(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])154 void lis_matvec_bsc_1x1(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
155 {
156 LIS_INT i,j,js,je,jj;
157 LIS_INT nr;
158 LIS_SCALAR t0;
159
160 nr = A->nr;
161 #ifdef _OPENMP
162 #pragma omp parallel for private(i,j,jj,js,je,t0)
163 #endif
164 for(i=0; i<nr; i++)
165 {
166 js = A->bptr[i];
167 je = A->bptr[i+1];
168 t0 = 0.0;
169 for(j=js;j<je;j++)
170 {
171 jj = A->bindex[j];
172 t0 += A->value[j] * x[jj];
173 }
174 y[i] = t0;
175 }
176 }
177
lis_matvec_bsc_1x2(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])178 void lis_matvec_bsc_1x2(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
179 {
180 LIS_INT i,j,js,je,jj;
181 LIS_INT nr;
182 LIS_SCALAR t0;
183
184 nr = A->nr;
185
186 #ifdef _OPENMP
187 #pragma omp parallel for private(i,j,jj,js,je,t0)
188 #endif
189 for(i=0; i<nr; i++)
190 {
191 js = A->bptr[i];
192 je = A->bptr[i+1];
193 t0 = 0.0;
194 for(j=js;j<je;j++)
195 {
196 jj = A->bindex[j];
197 t0 += A->value[j*2+0] * x[jj*2+0];
198 t0 += A->value[j*2+1] * x[jj*2+1];
199 }
200 y[i] = t0;
201 }
202 }
203
lis_matvec_bsc_1x3(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])204 void lis_matvec_bsc_1x3(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
205 {
206 LIS_INT i,j,js,je,jj;
207 LIS_INT nr;
208 LIS_SCALAR t0;
209
210 nr = A->nr;
211
212 #ifdef _OPENMP
213 #pragma omp parallel for private(i,j,jj,js,je,t0)
214 #endif
215 for(i=0; i<nr; i++)
216 {
217 js = A->bptr[i];
218 je = A->bptr[i+1];
219 t0 = 0.0;
220 for(j=js;j<je;j++)
221 {
222 jj = A->bindex[j];
223 t0 += A->value[j*3+0] * x[jj*3+0];
224 t0 += A->value[j*3+1] * x[jj*3+1];
225 t0 += A->value[j*3+2] * x[jj*3+2];
226 }
227 y[i] = t0;
228 }
229 }
230
lis_matvec_bsc_1x4(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])231 void lis_matvec_bsc_1x4(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
232 {
233 LIS_INT i,j,js,je,jj;
234 LIS_INT nr;
235 LIS_SCALAR t0;
236
237 nr = A->nr;
238
239 #ifdef _OPENMP
240 #pragma omp parallel for private(i,j,jj,js,je,t0)
241 #endif
242 for(i=0; i<nr; i++)
243 {
244 js = A->bptr[i];
245 je = A->bptr[i+1];
246 t0 = 0.0;
247 for(j=js;j<je;j++)
248 {
249 jj = A->bindex[j];
250 t0 += A->value[j*4+0] * x[jj*4+0];
251 t0 += A->value[j*4+1] * x[jj*4+1];
252 t0 += A->value[j*4+2] * x[jj*4+2];
253 t0 += A->value[j*4+3] * x[jj*4+3];
254 }
255 y[i] = t0;
256 }
257 }
258
lis_matvec_bsc_2x2(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])259 void lis_matvec_bsc_2x2(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
260 {
261 LIS_INT i,j,js,je,jj;
262 LIS_INT nr;
263 LIS_SCALAR t0,t1;
264
265 nr = A->nr;
266
267 if( A->is_splited )
268 {
269 #ifdef _OPENMP
270 #pragma omp parallel for private(i,j,jj,js,je,t0,t1)
271 #endif
272 for(i=0; i<nr; i++)
273 {
274 t0 = A->D->value[4*i+0] * x[2*i+0] + A->D->value[4*i+2] * x[2*i+1];
275 t1 = A->D->value[4*i+1] * x[2*i+0] + A->D->value[4*i+3] * x[2*i+1];
276 js = A->L->bptr[i];
277 je = A->L->bptr[i+1];
278 for(j=js;j<je;j++)
279 {
280 jj = A->L->bindex[j];
281 t0 += A->L->value[j*4+0] * x[jj*2+0];
282 t1 += A->L->value[j*4+1] * x[jj*2+0];
283 t0 += A->L->value[j*4+2] * x[jj*2+1];
284 t1 += A->L->value[j*4+3] * x[jj*2+1];
285 }
286 js = A->U->bptr[i];
287 je = A->U->bptr[i+1];
288 for(j=js;j<je;j++)
289 {
290 jj = A->U->bindex[j];
291 t0 += A->U->value[j*4+0] * x[jj*2+0];
292 t1 += A->U->value[j*4+1] * x[jj*2+0];
293 t0 += A->U->value[j*4+2] * x[jj*2+1];
294 t1 += A->U->value[j*4+3] * x[jj*2+1];
295 }
296 y[2*i+0] = t0;
297 y[2*i+1] = t1;
298 }
299 }
300 else
301 {
302 #ifdef _OPENMP
303 #pragma omp parallel for private(i,j,jj,js,je,t0,t1)
304 #endif
305 for(i=0; i<nr; i++)
306 {
307 js = A->bptr[i];
308 je = A->bptr[i+1];
309 t0 = 0.0;
310 t1 = 0.0;
311 for(j=js;j<je;j++)
312 {
313 jj = A->bindex[j];
314 t0 += A->value[j*4+0] * x[jj*2+0];
315 t1 += A->value[j*4+1] * x[jj*2+0];
316 t0 += A->value[j*4+2] * x[jj*2+1];
317 t1 += A->value[j*4+3] * x[jj*2+1];
318 }
319 y[2*i+0] = t0;
320 y[2*i+1] = t1;
321 }
322 }
323 }
324
lis_matvec_bsc_2x1(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])325 void lis_matvec_bsc_2x1(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
326 {
327 LIS_INT i,j,js,je,jj;
328 LIS_INT nr;
329 LIS_SCALAR t0,t1;
330
331 nr = A->nr;
332
333 #ifdef _OPENMP
334 #pragma omp parallel for private(i,j,jj,js,je,t0,t1)
335 #endif
336 for(i=0; i<nr; i++)
337 {
338 js = A->bptr[i];
339 je = A->bptr[i+1];
340 t0 = 0.0;
341 t1 = 0.0;
342 for(j=js;j<je;j++)
343 {
344 jj = A->bindex[j];
345 t0 += A->value[j*2+0] * x[jj];
346 t1 += A->value[j*2+1] * x[jj];
347 }
348 y[2*i+0] = t0;
349 y[2*i+1] = t1;
350 }
351 }
352
lis_matvec_bsc_2x3(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])353 void lis_matvec_bsc_2x3(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
354 {
355 LIS_INT i,j,js,je,jj;
356 LIS_INT nr;
357 LIS_SCALAR t0,t1;
358
359 nr = A->nr;
360
361 #ifdef _OPENMP
362 #pragma omp parallel for private(i,j,jj,js,je,t0,t1)
363 #endif
364 for(i=0; i<nr; i++)
365 {
366 js = A->bptr[i];
367 je = A->bptr[i+1];
368 t0 = 0.0;
369 t1 = 0.0;
370 for(j=js;j<je;j++)
371 {
372 jj = A->bindex[j];
373 t0 += A->value[j*6+0] * x[jj*3+0];
374 t1 += A->value[j*6+1] * x[jj*3+0];
375 t0 += A->value[j*6+2] * x[jj*3+1];
376 t1 += A->value[j*6+3] * x[jj*3+1];
377 t0 += A->value[j*6+4] * x[jj*3+2];
378 t1 += A->value[j*6+5] * x[jj*3+2];
379 }
380 y[2*i+0] = t0;
381 y[2*i+1] = t1;
382 }
383 }
384
lis_matvec_bsc_2x4(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])385 void lis_matvec_bsc_2x4(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
386 {
387 LIS_INT i,j,js,je,jj;
388 LIS_INT nr;
389 LIS_SCALAR t0,t1;
390
391 nr = A->nr;
392
393 #ifdef _OPENMP
394 #pragma omp parallel for private(i,j,jj,js,je,t0,t1)
395 #endif
396 for(i=0; i<nr; i++)
397 {
398 js = A->bptr[i];
399 je = A->bptr[i+1];
400 t0 = 0.0;
401 t1 = 0.0;
402 for(j=js;j<je;j++)
403 {
404 jj = A->bindex[j];
405 t0 += A->value[j*8+0] * x[jj*4+0];
406 t1 += A->value[j*8+1] * x[jj*4+0];
407 t0 += A->value[j*8+2] * x[jj*4+1];
408 t1 += A->value[j*8+3] * x[jj*4+1];
409 t0 += A->value[j*8+4] * x[jj*4+2];
410 t1 += A->value[j*8+5] * x[jj*4+2];
411 t0 += A->value[j*8+6] * x[jj*4+3];
412 t1 += A->value[j*8+7] * x[jj*4+3];
413 }
414 y[2*i+0] = t0;
415 y[2*i+1] = t1;
416 }
417 }
418
lis_matvec_bsc_3x3(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])419 void lis_matvec_bsc_3x3(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
420 {
421 LIS_INT i,j,js,je,jj;
422 LIS_INT nr;
423 LIS_SCALAR t0,t1,t2;
424
425 nr = A->nr;
426
427 #ifdef _OPENMP
428 #pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2)
429 #endif
430 for(i=0; i<nr; i++)
431 {
432 js = A->bptr[i];
433 je = A->bptr[i+1];
434 t0 = 0.0;
435 t1 = 0.0;
436 t2 = 0.0;
437 for(j=js;j<je;j++)
438 {
439 jj = A->bindex[j];
440 t0 += A->value[j*9+0] * x[jj*3+0];
441 t1 += A->value[j*9+1] * x[jj*3+0];
442 t2 += A->value[j*9+2] * x[jj*3+0];
443 t0 += A->value[j*9+3] * x[jj*3+1];
444 t1 += A->value[j*9+4] * x[jj*3+1];
445 t2 += A->value[j*9+5] * x[jj*3+1];
446 t0 += A->value[j*9+6] * x[jj*3+2];
447 t1 += A->value[j*9+7] * x[jj*3+2];
448 t2 += A->value[j*9+8] * x[jj*3+2];
449 }
450 y[3*i+0] = t0;
451 y[3*i+1] = t1;
452 y[3*i+2] = t2;
453 }
454 }
455
lis_matvec_bsc_3x1(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])456 void lis_matvec_bsc_3x1(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
457 {
458 LIS_INT i,j,js,je,jj,ii;
459 LIS_INT nr;
460 LIS_SCALAR t0,t1,t2;
461
462 nr = A->nr;
463
464 #ifdef _OPENMP
465 #pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2,ii)
466 #endif
467 for(i=0; i<nr; i++)
468 {
469 ii = 3*i;
470 js = A->bptr[i];
471 je = A->bptr[i+1];
472 t0 = 0.0;
473 t1 = 0.0;
474 t2 = 0.0;
475 for(j=js;j<je;j++)
476 {
477 jj = A->bindex[j];
478 t0 += A->value[j*3+0] * x[jj];
479 t1 += A->value[j*3+1] * x[jj];
480 t2 += A->value[j*3+2] * x[jj];
481 }
482 y[ii+0] = t0;
483 y[ii+1] = t1;
484 y[ii+2] = t2;
485 }
486 }
487
lis_matvec_bsc_3x2(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])488 void lis_matvec_bsc_3x2(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
489 {
490 LIS_INT i,j,js,je,jj;
491 LIS_INT nr;
492 LIS_SCALAR t0,t1,t2;
493
494 nr = A->nr;
495
496 #ifdef _OPENMP
497 #pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2)
498 #endif
499 for(i=0; i<nr; i++)
500 {
501 js = A->bptr[i];
502 je = A->bptr[i+1];
503 t0 = 0.0;
504 t1 = 0.0;
505 t2 = 0.0;
506 for(j=js;j<je;j++)
507 {
508 jj = A->bindex[j];
509 t0 += A->value[j*6+0] * x[jj*2+0];
510 t1 += A->value[j*6+1] * x[jj*2+0];
511 t2 += A->value[j*6+2] * x[jj*2+0];
512 t0 += A->value[j*6+3] * x[jj*2+1];
513 t1 += A->value[j*6+4] * x[jj*2+1];
514 t2 += A->value[j*6+5] * x[jj*2+1];
515 }
516 y[3*i+0] = t0;
517 y[3*i+1] = t1;
518 y[3*i+2] = t2;
519 }
520 }
521
lis_matvec_bsc_3x4(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])522 void lis_matvec_bsc_3x4(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
523 {
524 LIS_INT i,j,js,je,jj;
525 LIS_INT nr;
526 LIS_SCALAR t0,t1,t2;
527
528 nr = A->nr;
529
530 #ifdef _OPENMP
531 #pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2)
532 #endif
533 for(i=0; i<nr; i++)
534 {
535 js = A->bptr[i];
536 je = A->bptr[i+1];
537 t0 = 0.0;
538 t1 = 0.0;
539 t2 = 0.0;
540 for(j=js;j<je;j++)
541 {
542 jj = A->bindex[j];
543 t0 += A->value[j*12+ 0] * x[jj*4+0];
544 t1 += A->value[j*12+ 1] * x[jj*4+0];
545 t2 += A->value[j*12+ 2] * x[jj*4+0];
546 t0 += A->value[j*12+ 3] * x[jj*4+1];
547 t1 += A->value[j*12+ 4] * x[jj*4+1];
548 t2 += A->value[j*12+ 5] * x[jj*4+1];
549 t0 += A->value[j*12+ 6] * x[jj*4+2];
550 t1 += A->value[j*12+ 7] * x[jj*4+2];
551 t2 += A->value[j*12+ 8] * x[jj*4+2];
552 t0 += A->value[j*12+ 9] * x[jj*4+3];
553 t1 += A->value[j*12+10] * x[jj*4+3];
554 t2 += A->value[j*12+11] * x[jj*4+3];
555 }
556 y[3*i+0] = t0;
557 y[3*i+1] = t1;
558 y[3*i+2] = t2;
559 }
560 }
561
lis_matvec_bsc_4x4(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])562 void lis_matvec_bsc_4x4(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
563 {
564 LIS_INT i,j,js,je,jj;
565 LIS_INT nr;
566 LIS_SCALAR t0,t1,t2,t3;
567
568 nr = A->nr;
569
570 #ifdef _OPENMP
571 #pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2,t3)
572 #endif
573 for(i=0; i<nr; i++)
574 {
575 js = A->bptr[i];
576 je = A->bptr[i+1];
577 t0 = 0.0;
578 t1 = 0.0;
579 t2 = 0.0;
580 t3 = 0.0;
581 for(j=js;j<je;j++)
582 {
583 jj = A->bindex[j];
584 t0 += A->value[j*16+ 0] * x[jj*4+0];
585 t1 += A->value[j*16+ 1] * x[jj*4+0];
586 t2 += A->value[j*16+ 2] * x[jj*4+0];
587 t3 += A->value[j*16+ 3] * x[jj*4+0];
588 t0 += A->value[j*16+ 4] * x[jj*4+1];
589 t1 += A->value[j*16+ 5] * x[jj*4+1];
590 t2 += A->value[j*16+ 6] * x[jj*4+1];
591 t3 += A->value[j*16+ 7] * x[jj*4+1];
592 t0 += A->value[j*16+ 8] * x[jj*4+2];
593 t1 += A->value[j*16+ 9] * x[jj*4+2];
594 t2 += A->value[j*16+10] * x[jj*4+2];
595 t3 += A->value[j*16+11] * x[jj*4+2];
596 t0 += A->value[j*16+12] * x[jj*4+3];
597 t1 += A->value[j*16+13] * x[jj*4+3];
598 t2 += A->value[j*16+14] * x[jj*4+3];
599 t3 += A->value[j*16+15] * x[jj*4+3];
600 }
601 y[4*i+0] = t0;
602 y[4*i+1] = t1;
603 y[4*i+2] = t2;
604 y[4*i+3] = t3;
605 }
606 }
607
lis_matvec_bsc_4x1(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])608 void lis_matvec_bsc_4x1(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
609 {
610 LIS_INT i,j,js,je,jj;
611 LIS_INT nr;
612 LIS_SCALAR t0,t1,t2,t3;
613
614 nr = A->nr;
615
616 #ifdef _OPENMP
617 #pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2,t3)
618 #endif
619 for(i=0; i<nr; i++)
620 {
621 js = A->bptr[i];
622 je = A->bptr[i+1];
623 t0 = 0.0;
624 t1 = 0.0;
625 t2 = 0.0;
626 t3 = 0.0;
627 for(j=js;j<je;j++)
628 {
629 jj = A->bindex[j];
630 t0 += A->value[j*4+ 0] * x[jj];
631 t1 += A->value[j*4+ 1] * x[jj];
632 t2 += A->value[j*4+ 2] * x[jj];
633 t3 += A->value[j*4+ 3] * x[jj];
634 }
635 y[4*i+0] = t0;
636 y[4*i+1] = t1;
637 y[4*i+2] = t2;
638 y[4*i+3] = t3;
639 }
640 }
641
lis_matvec_bsc_4x2(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])642 void lis_matvec_bsc_4x2(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
643 {
644 LIS_INT i,j,js,je,jj;
645 LIS_INT nr;
646 LIS_SCALAR t0,t1,t2,t3;
647
648 nr = A->nr;
649
650 #ifdef _OPENMP
651 #pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2,t3)
652 #endif
653 for(i=0; i<nr; i++)
654 {
655 js = A->bptr[i];
656 je = A->bptr[i+1];
657 t0 = 0.0;
658 t1 = 0.0;
659 t2 = 0.0;
660 t3 = 0.0;
661 for(j=js;j<je;j++)
662 {
663 jj = A->bindex[j];
664 t0 += A->value[j*8+ 0] * x[jj*2+0];
665 t1 += A->value[j*8+ 1] * x[jj*2+0];
666 t2 += A->value[j*8+ 2] * x[jj*2+0];
667 t3 += A->value[j*8+ 3] * x[jj*2+0];
668 t0 += A->value[j*8+ 4] * x[jj*2+1];
669 t1 += A->value[j*8+ 5] * x[jj*2+1];
670 t2 += A->value[j*8+ 6] * x[jj*2+1];
671 t3 += A->value[j*8+ 7] * x[jj*2+1];
672 }
673 y[4*i+0] = t0;
674 y[4*i+1] = t1;
675 y[4*i+2] = t2;
676 y[4*i+3] = t3;
677 }
678 }
679
lis_matvec_bsc_4x3(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])680 void lis_matvec_bsc_4x3(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
681 {
682 LIS_INT i,j,js,je,jj;
683 LIS_INT nr;
684 LIS_SCALAR t0,t1,t2,t3;
685
686 nr = A->nr;
687
688 #ifdef _OPENMP
689 #pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2,t3)
690 #endif
691 for(i=0; i<nr; i++)
692 {
693 js = A->bptr[i];
694 je = A->bptr[i+1];
695 t0 = 0.0;
696 t1 = 0.0;
697 t2 = 0.0;
698 t3 = 0.0;
699 for(j=js;j<je;j++)
700 {
701 jj = A->bindex[j];
702 t0 += A->value[j*12+ 0] * x[jj*3+0];
703 t1 += A->value[j*12+ 1] * x[jj*3+0];
704 t2 += A->value[j*12+ 2] * x[jj*3+0];
705 t3 += A->value[j*12+ 3] * x[jj*3+0];
706 t0 += A->value[j*12+ 4] * x[jj*3+1];
707 t1 += A->value[j*12+ 5] * x[jj*3+1];
708 t2 += A->value[j*12+ 6] * x[jj*3+1];
709 t3 += A->value[j*12+ 7] * x[jj*3+1];
710 t0 += A->value[j*12+ 8] * x[jj*3+2];
711 t1 += A->value[j*12+ 9] * x[jj*3+2];
712 t2 += A->value[j*12+10] * x[jj*3+2];
713 t3 += A->value[j*12+11] * x[jj*3+2];
714 }
715 y[4*i+0] = t0;
716 y[4*i+1] = t1;
717 y[4*i+2] = t2;
718 y[4*i+3] = t3;
719 }
720 }
721
lis_matvech_bsc(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])722 void lis_matvech_bsc(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
723 {
724 LIS_INT i,j,k;
725 LIS_INT bi,bj,bc,bs;
726 LIS_INT nr,nc,bnr,bnc;
727 LIS_INT n,np;
728 #ifdef _OPENMP
729 LIS_INT nprocs;
730 LIS_SCALAR t;
731 LIS_SCALAR *w;
732 #endif
733
734 n = A->n;
735 np = A->np;
736 nr = A->nr;
737 nc = A->nc;
738 bnr = A->bnr;
739 bnc = A->bnc;
740 bs = bnr*bnc;
741 if( A->is_splited )
742 {
743 for(i=0;i<n;i++)
744 {
745 y[i] = 0.0;
746 }
747 for(bi=0;bi<nr;bi++)
748 {
749 k = bi*bs;
750 for(j=0;j<bnc;j++)
751 {
752 for(i=0;i<bnr;i++)
753 {
754 y[bi*bnr+j] += conj(A->D->value[k++]) * x[bi*bnr+i];
755 }
756 }
757 }
758 for(bi=0;bi<nc;bi++)
759 {
760 for(bc=A->L->bptr[bi];bc<A->L->bptr[bi+1];bc++)
761 {
762 bj = A->L->bindex[bc] * bnr;
763 k = bc*bs;
764 for(j=0;j<bnc;j++)
765 {
766 for(i=0;i<bnr;i++)
767 {
768 y[bj+j] += conj(A->L->value[k]) * x[bi*bnr+i];
769 k++;
770 }
771 }
772 }
773 for(bc=A->U->bptr[bi];bc<A->U->bptr[bi+1];bc++)
774 {
775 bj = A->U->bindex[bc] * bnr;
776 k = bc*bs;
777 for(j=0;j<bnc;j++)
778 {
779 for(i=0;i<bnr;i++)
780 {
781 y[bj+j] += conj(A->U->value[k]) * x[bi*bnr+i];
782 k++;
783 }
784 }
785 }
786 }
787 }
788 else
789 {
790 #ifdef _OPENMP
791 nprocs = omp_get_max_threads();
792 w = (LIS_SCALAR *)lis_malloc( nprocs*np*sizeof(LIS_SCALAR),"lis_matvech_bsc::w" );
793
794 #pragma omp parallel private(bi,bc,bj,i,j,k)
795 {
796 #pragma omp for
797 for(j=0;j<nprocs;j++)
798 {
799 memset( &w[j*np], 0, np*sizeof(LIS_SCALAR) );
800 }
801 #pragma omp for
802 for(bi=0;bi<nc;bi++)
803 {
804 for(bc=A->bptr[bi];bc<A->bptr[bi+1];bc++)
805 {
806 bj = A->bindex[bc] * bnr;
807 k = bc*bs;
808 for(j=0;j<bnc;j++)
809 {
810 for(i=0;i<bnr;i++)
811 {
812 w[bi*bnc+j] += conj(A->value[k]) * x[bj+i];
813 k++;
814 }
815 }
816 }
817 }
818 #pragma omp barrier
819 #pragma omp for
820 for(i=0;i<np;i++)
821 {
822 t = 0.0;
823 for(j=0;j<nprocs;j++)
824 {
825 t += w[j*np+i];
826 }
827 y[i] = t;
828 }
829 }
830 lis_free(w);
831 #else
832 for(i=0; i<n; i++)
833 {
834 y[i] = 0.0;
835 }
836 for(bi=0;bi<nc;bi++)
837 {
838 for(bc=A->bptr[bi];bc<A->bptr[bi+1];bc++)
839 {
840 bj = A->bindex[bc] * bnr;
841 k = bc*bs;
842 for(j=0;j<bnc;j++)
843 {
844 for(i=0;i<bnr;i++)
845 {
846 y[bi*bnc+j] += conj(A->value[k]) * x[bj+i];
847 k++;
848 }
849 }
850 }
851 }
852 #endif
853 }
854 }
855