1 /* Copyright (C) 2005 The Scalable Software Infrastructure Project. All rights reserved.
2
3 Redistribution and use in source and binary forms, with or without
4 modification, are permitted provided that the following conditions are met:
5 1. Redistributions of source code must retain the above copyright
6 notice, this list of conditions and the following disclaimer.
7 2. Redistributions in binary form must reproduce the above copyright
8 notice, this list of conditions and the following disclaimer in the
9 documentation and/or other materials provided with the distribution.
10 3. Neither the name of the project nor the names of its contributors
11 may be used to endorse or promote products derived from this software
12 without specific prior written permission.
13
14 THIS SOFTWARE IS PROVIDED BY THE SCALABLE SOFTWARE INFRASTRUCTURE PROJECT
15 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
16 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE SCALABLE SOFTWARE INFRASTRUCTURE
18 PROJECT BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
19 OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
20 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
21 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
22 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
24 POSSIBILITY OF SUCH DAMAGE.
25 */
26
27 #ifdef HAVE_CONFIG_H
28 #include "lis_config.h"
29 #else
30 #ifdef HAVE_CONFIG_WIN_H
31 #include "lis_config_win.h"
32 #endif
33 #endif
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #ifdef HAVE_MALLOC_H
38 #include <malloc.h>
39 #endif
40 #include <string.h>
41 #include <stdarg.h>
42 #ifdef _OPENMP
43 #include <omp.h>
44 #endif
45 #ifdef USE_MPI
46 #include <mpi.h>
47 #endif
48 #include "lislib.h"
49
50 LIS_MATVEC_XXX lis_matvec_bsr_xxx[4][4] = {
51 {lis_matvec_bsr_1x1, lis_matvec_bsr_1x2, lis_matvec_bsr_1x3, lis_matvec_bsr_1x4},
52 {lis_matvec_bsr_2x1, lis_matvec_bsr_2x2, lis_matvec_bsr_2x3, lis_matvec_bsr_2x4},
53 {lis_matvec_bsr_3x1, lis_matvec_bsr_3x2, lis_matvec_bsr_3x3, lis_matvec_bsr_3x4},
54 {lis_matvec_bsr_4x1, lis_matvec_bsr_4x2, lis_matvec_bsr_4x3, lis_matvec_bsr_4x4}
55 };
56
lis_matvec_bsr(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])57 void lis_matvec_bsr(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
58 {
59 LIS_INT i,j,k;
60 LIS_INT bi,bj,bc,bs;
61 LIS_INT nr,bnr,bnc;
62 LIS_INT n;
63
64 n = A->n;
65 nr = A->nr;
66 bnr = A->bnr;
67 bnc = A->bnc;
68 bs = bnr*bnc;
69
70 if( A->is_splited )
71 {
72 #ifdef _OPENMP
73 #pragma omp parallel for private(i)
74 #endif
75 for(i=0; i<n; i++)
76 {
77 y[i] = 0.0;
78 }
79 #ifdef _OPENMP
80 #pragma omp parallel for private(i,j,k,bi,bj,bc)
81 #endif
82 for(bi=0;bi<nr;bi++)
83 {
84 k = bi*bs;
85 for(j=0;j<bnc;j++)
86 {
87 for(i=0;i<bnr;i++)
88 {
89 y[bi*bnr+i] += A->D->value[k] * x[bi*bnr+j];
90 k++;
91 }
92 }
93 for(bc=A->L->bptr[bi];bc<A->L->bptr[bi+1];bc++)
94 {
95 bj = A->L->bindex[bc] * bnc;
96 k = bc*bs;
97 for(j=0;j<bnc;j++)
98 {
99 for(i=0;i<bnr;i++)
100 {
101 y[bi*bnr+i] += A->L->value[k] * x[bj+j];
102 k++;
103 }
104 }
105 }
106 for(bc=A->U->bptr[bi];bc<A->U->bptr[bi+1];bc++)
107 {
108 bj = A->U->bindex[bc] * bnc;
109 k = bc*bs;
110 for(j=0;j<bnc;j++)
111 {
112 for(i=0;i<bnr;i++)
113 {
114 y[bi*bnr+i] += A->U->value[k] * x[bj+j];
115 k++;
116 }
117 }
118 }
119 }
120 }
121 else
122 {
123 #ifdef _OPENMP
124 #pragma omp parallel for private(i)
125 #endif
126 for(i=0; i<n; i++)
127 {
128 y[i] = 0.0;
129 }
130 #ifdef _OPENMP
131 #pragma omp parallel for private(i,j,k,bi,bj,bc)
132 #endif
133 for(bi=0;bi<nr;bi++)
134 {
135 for(bc=A->bptr[bi];bc<A->bptr[bi+1];bc++)
136 {
137 bj = A->bindex[bc] * bnc;
138 k = bc*bs;
139 for(j=0;j<bnc;j++)
140 {
141 for(i=0;i<bnr;i++)
142 {
143 y[bi*bnr+i] += A->value[k] * x[bj+j];
144 k++;
145 }
146 }
147 }
148 }
149 }
150 }
151
lis_matvec_bsr_1x1(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])152 void lis_matvec_bsr_1x1(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
153 {
154 LIS_INT i,j,js,je,jj;
155 LIS_INT nr;
156 LIS_SCALAR t0;
157
158 nr = A->nr;
159 if( A->is_splited )
160 {
161 #ifdef _OPENMP
162 #pragma omp parallel for private(i,j,jj,js,je,t0)
163 #endif
164 for(i=0; i<nr; i++)
165 {
166 t0 = A->D->value[i] * x[i];
167 js = A->L->bptr[i];
168 je = A->L->bptr[i+1];
169 for(j=js;j<je;j++)
170 {
171 jj = A->L->bindex[j];
172 t0 += A->L->value[j] * x[jj];
173 }
174 js = A->U->bptr[i];
175 je = A->U->bptr[i+1];
176 for(j=js;j<je;j++)
177 {
178 jj = A->U->bindex[j];
179 t0 += A->U->value[j] * x[jj];
180 }
181 y[i] = t0;
182 }
183 }
184 else
185 {
186 #ifdef _OPENMP
187 #pragma omp parallel for private(i,j,jj,js,je,t0)
188 #endif
189 for(i=0; i<nr; i++)
190 {
191 js = A->bptr[i];
192 je = A->bptr[i+1];
193 t0 = 0.0;
194 for(j=js;j<je;j++)
195 {
196 jj = A->bindex[j];
197 t0 += A->value[j] * x[jj];
198 }
199 y[i] = t0;
200 }
201 }
202 }
203
lis_matvec_bsr_1x2(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])204 void lis_matvec_bsr_1x2(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
205 {
206 LIS_INT i,j,js,je,jj;
207 LIS_INT nr;
208 LIS_SCALAR t0;
209
210 nr = A->nr;
211
212 #ifdef _OPENMP
213 #pragma omp parallel for private(i,j,jj,js,je,t0)
214 #endif
215 for(i=0; i<nr; i++)
216 {
217 js = A->bptr[i];
218 je = A->bptr[i+1];
219 t0 = 0.0;
220 for(j=js;j<je;j++)
221 {
222 jj = A->bindex[j];
223 t0 += A->value[j*2+0] * x[jj*2+0];
224 t0 += A->value[j*2+1] * x[jj*2+1];
225 }
226 y[i] = t0;
227 }
228 }
229
lis_matvec_bsr_1x3(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])230 void lis_matvec_bsr_1x3(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
231 {
232 LIS_INT i,j,js,je,jj;
233 LIS_INT nr;
234 LIS_SCALAR t0;
235
236 nr = A->nr;
237
238 #ifdef _OPENMP
239 #pragma omp parallel for private(i,j,jj,js,je,t0)
240 #endif
241 for(i=0; i<nr; i++)
242 {
243 js = A->bptr[i];
244 je = A->bptr[i+1];
245 t0 = 0.0;
246 for(j=js;j<je;j++)
247 {
248 jj = A->bindex[j];
249 t0 += A->value[j*3+0] * x[jj*3+0];
250 t0 += A->value[j*3+1] * x[jj*3+1];
251 t0 += A->value[j*3+2] * x[jj*3+2];
252 }
253 y[i] = t0;
254 }
255 }
256
lis_matvec_bsr_1x4(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])257 void lis_matvec_bsr_1x4(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
258 {
259 LIS_INT i,j,js,je,jj;
260 LIS_INT nr;
261 LIS_SCALAR t0;
262
263 nr = A->nr;
264
265 #ifdef _OPENMP
266 #pragma omp parallel for private(i,j,jj,js,je,t0)
267 #endif
268 for(i=0; i<nr; i++)
269 {
270 js = A->bptr[i];
271 je = A->bptr[i+1];
272 t0 = 0.0;
273 for(j=js;j<je;j++)
274 {
275 jj = A->bindex[j];
276 t0 += A->value[j*4+0] * x[jj*4+0];
277 t0 += A->value[j*4+1] * x[jj*4+1];
278 t0 += A->value[j*4+2] * x[jj*4+2];
279 t0 += A->value[j*4+3] * x[jj*4+3];
280 }
281 y[i] = t0;
282 }
283 }
284
lis_matvec_bsr_2x2(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])285 void lis_matvec_bsr_2x2(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
286 {
287 LIS_INT i,j,js,je,jj;
288 LIS_INT nr;
289 LIS_SCALAR t0,t1;
290
291 nr = A->nr;
292
293 if( A->is_splited )
294 {
295 #ifdef _OPENMP
296 #pragma omp parallel for private(i,j,jj,js,je,t0,t1)
297 #endif
298 for(i=0; i<nr; i++)
299 {
300 t0 = A->D->value[4*i+0] * x[2*i+0] + A->D->value[4*i+2] * x[2*i+1];
301 t1 = A->D->value[4*i+1] * x[2*i+0] + A->D->value[4*i+3] * x[2*i+1];
302 js = A->L->bptr[i];
303 je = A->L->bptr[i+1];
304 for(j=js;j<je;j++)
305 {
306 jj = A->L->bindex[j];
307 t0 += A->L->value[j*4+0] * x[jj*2+0];
308 t1 += A->L->value[j*4+1] * x[jj*2+0];
309 t0 += A->L->value[j*4+2] * x[jj*2+1];
310 t1 += A->L->value[j*4+3] * x[jj*2+1];
311 }
312 js = A->U->bptr[i];
313 je = A->U->bptr[i+1];
314 for(j=js;j<je;j++)
315 {
316 jj = A->U->bindex[j];
317 t0 += A->U->value[j*4+0] * x[jj*2+0];
318 t1 += A->U->value[j*4+1] * x[jj*2+0];
319 t0 += A->U->value[j*4+2] * x[jj*2+1];
320 t1 += A->U->value[j*4+3] * x[jj*2+1];
321 }
322 y[2*i+0] = t0;
323 y[2*i+1] = t1;
324 }
325 }
326 else
327 {
328 #ifdef _OPENMP
329 #pragma omp parallel for private(i,j,jj,js,je,t0,t1)
330 #endif
331 for(i=0; i<nr; i++)
332 {
333 js = A->bptr[i];
334 je = A->bptr[i+1];
335 t0 = 0.0;
336 t1 = 0.0;
337 for(j=js;j<je;j++)
338 {
339 jj = A->bindex[j];
340 t0 += A->value[j*4+0] * x[jj*2+0];
341 t1 += A->value[j*4+1] * x[jj*2+0];
342 t0 += A->value[j*4+2] * x[jj*2+1];
343 t1 += A->value[j*4+3] * x[jj*2+1];
344 }
345 y[2*i+0] = t0;
346 y[2*i+1] = t1;
347 }
348 }
349 }
350
lis_matvec_bsr_2x1(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])351 void lis_matvec_bsr_2x1(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
352 {
353 LIS_INT i,j,js,je,jj;
354 LIS_INT nr;
355 LIS_SCALAR t0,t1;
356
357 nr = A->nr;
358
359 #ifdef _OPENMP
360 #pragma omp parallel for private(i,j,jj,js,je,t0,t1)
361 #endif
362 for(i=0; i<nr; i++)
363 {
364 js = A->bptr[i];
365 je = A->bptr[i+1];
366 t0 = 0.0;
367 t1 = 0.0;
368 for(j=js;j<je;j++)
369 {
370 jj = A->bindex[j];
371 t0 += A->value[j*2+0] * x[jj];
372 t1 += A->value[j*2+1] * x[jj];
373 }
374 y[2*i+0] = t0;
375 y[2*i+1] = t1;
376 }
377 }
378
lis_matvec_bsr_2x3(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])379 void lis_matvec_bsr_2x3(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
380 {
381 LIS_INT i,j,js,je,jj;
382 LIS_INT nr;
383 LIS_SCALAR t0,t1;
384
385 nr = A->nr;
386
387 #ifdef _OPENMP
388 #pragma omp parallel for private(i,j,jj,js,je,t0,t1)
389 #endif
390 for(i=0; i<nr; i++)
391 {
392 js = A->bptr[i];
393 je = A->bptr[i+1];
394 t0 = 0.0;
395 t1 = 0.0;
396 for(j=js;j<je;j++)
397 {
398 jj = A->bindex[j];
399 t0 += A->value[j*6+0] * x[jj*3+0];
400 t1 += A->value[j*6+1] * x[jj*3+0];
401 t0 += A->value[j*6+2] * x[jj*3+1];
402 t1 += A->value[j*6+3] * x[jj*3+1];
403 t0 += A->value[j*6+4] * x[jj*3+2];
404 t1 += A->value[j*6+5] * x[jj*3+2];
405 }
406 y[2*i+0] = t0;
407 y[2*i+1] = t1;
408 }
409 }
410
lis_matvec_bsr_2x4(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])411 void lis_matvec_bsr_2x4(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
412 {
413 LIS_INT i,j,js,je,jj;
414 LIS_INT nr;
415 LIS_SCALAR t0,t1;
416
417 nr = A->nr;
418
419 #ifdef _OPENMP
420 #pragma omp parallel for private(i,j,jj,js,je,t0,t1)
421 #endif
422 for(i=0; i<nr; i++)
423 {
424 js = A->bptr[i];
425 je = A->bptr[i+1];
426 t0 = 0.0;
427 t1 = 0.0;
428 for(j=js;j<je;j++)
429 {
430 jj = A->bindex[j];
431 t0 += A->value[j*8+0] * x[jj*4+0];
432 t1 += A->value[j*8+1] * x[jj*4+0];
433 t0 += A->value[j*8+2] * x[jj*4+1];
434 t1 += A->value[j*8+3] * x[jj*4+1];
435 t0 += A->value[j*8+4] * x[jj*4+2];
436 t1 += A->value[j*8+5] * x[jj*4+2];
437 t0 += A->value[j*8+6] * x[jj*4+3];
438 t1 += A->value[j*8+7] * x[jj*4+3];
439 }
440 y[2*i+0] = t0;
441 y[2*i+1] = t1;
442 }
443 }
444
lis_matvec_bsr_3x3(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])445 void lis_matvec_bsr_3x3(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
446 {
447 LIS_INT i,j,js,je,jj;
448 LIS_INT nr;
449 LIS_SCALAR t0,t1,t2;
450
451 nr = A->nr;
452
453 if( A->is_splited )
454 {
455 #ifdef _OPENMP
456 #pragma omp parallel for private(i,j,jj,js,je,t0,t1)
457 #endif
458 for(i=0; i<nr; i++)
459 {
460 t0 = A->D->value[9*i+0] * x[3*i+0] + A->D->value[9*i+3] * x[3*i+1] + A->D->value[9*i+6] * x[3*i+2];
461 t1 = A->D->value[9*i+1] * x[3*i+0] + A->D->value[9*i+4] * x[3*i+1] + A->D->value[9*i+7] * x[3*i+2];
462 t2 = A->D->value[9*i+2] * x[3*i+0] + A->D->value[9*i+5] * x[3*i+1] + A->D->value[9*i+8] * x[3*i+2];
463 js = A->L->bptr[i];
464 je = A->L->bptr[i+1];
465 for(j=js;j<je;j++)
466 {
467 jj = A->L->bindex[j];
468 t0 += A->L->value[j*9+0] * x[jj*3+0];
469 t1 += A->L->value[j*9+1] * x[jj*3+0];
470 t2 += A->L->value[j*9+2] * x[jj*3+0];
471 t0 += A->L->value[j*9+3] * x[jj*3+1];
472 t1 += A->L->value[j*9+4] * x[jj*3+1];
473 t2 += A->L->value[j*9+5] * x[jj*3+1];
474 t0 += A->L->value[j*9+6] * x[jj*3+2];
475 t1 += A->L->value[j*9+7] * x[jj*3+2];
476 t2 += A->L->value[j*9+8] * x[jj*3+2];
477 }
478 js = A->U->bptr[i];
479 je = A->U->bptr[i+1];
480 for(j=js;j<je;j++)
481 {
482 jj = A->U->bindex[j];
483 t0 += A->U->value[j*9+0] * x[jj*3+0];
484 t1 += A->U->value[j*9+1] * x[jj*3+0];
485 t2 += A->U->value[j*9+2] * x[jj*3+0];
486 t0 += A->U->value[j*9+3] * x[jj*3+1];
487 t1 += A->U->value[j*9+4] * x[jj*3+1];
488 t2 += A->U->value[j*9+5] * x[jj*3+1];
489 t0 += A->U->value[j*9+6] * x[jj*3+2];
490 t1 += A->U->value[j*9+7] * x[jj*3+2];
491 t2 += A->U->value[j*9+8] * x[jj*3+2];
492 }
493 y[3*i+0] = t0;
494 y[3*i+1] = t1;
495 y[3*i+2] = t2;
496 }
497 }
498 else
499 {
500 #ifdef _OPENMP
501 #pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2)
502 #endif
503 for(i=0; i<nr; i++)
504 {
505 js = A->bptr[i];
506 je = A->bptr[i+1];
507 t0 = 0.0;
508 t1 = 0.0;
509 t2 = 0.0;
510 for(j=js;j<je;j++)
511 {
512 jj = A->bindex[j];
513 t0 += A->value[j*9+0] * x[jj*3+0];
514 t1 += A->value[j*9+1] * x[jj*3+0];
515 t2 += A->value[j*9+2] * x[jj*3+0];
516 t0 += A->value[j*9+3] * x[jj*3+1];
517 t1 += A->value[j*9+4] * x[jj*3+1];
518 t2 += A->value[j*9+5] * x[jj*3+1];
519 t0 += A->value[j*9+6] * x[jj*3+2];
520 t1 += A->value[j*9+7] * x[jj*3+2];
521 t2 += A->value[j*9+8] * x[jj*3+2];
522 }
523 y[3*i+0] = t0;
524 y[3*i+1] = t1;
525 y[3*i+2] = t2;
526 }
527 }
528 }
529
lis_matvec_bsr_3x1(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])530 void lis_matvec_bsr_3x1(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
531 {
532 LIS_INT i,j,js,je,jj,ii;
533 LIS_INT nr;
534 LIS_SCALAR t0,t1,t2;
535
536 nr = A->nr;
537
538 #ifdef _OPENMP
539 #pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2,ii)
540 #endif
541 for(i=0; i<nr; i++)
542 {
543 ii = 3*i;
544 js = A->bptr[i];
545 je = A->bptr[i+1];
546 t0 = 0.0;
547 t1 = 0.0;
548 t2 = 0.0;
549 for(j=js;j<je;j++)
550 {
551 jj = A->bindex[j];
552 t0 += A->value[j*3+0] * x[jj];
553 t1 += A->value[j*3+1] * x[jj];
554 t2 += A->value[j*3+2] * x[jj];
555 }
556 y[ii+0] = t0;
557 y[ii+1] = t1;
558 y[ii+2] = t2;
559 }
560 }
561
lis_matvec_bsr_3x2(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])562 void lis_matvec_bsr_3x2(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
563 {
564 LIS_INT i,j,js,je,jj;
565 LIS_INT nr;
566 LIS_SCALAR t0,t1,t2;
567
568 nr = A->nr;
569
570 #ifdef _OPENMP
571 #pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2)
572 #endif
573 for(i=0; i<nr; i++)
574 {
575 js = A->bptr[i];
576 je = A->bptr[i+1];
577 t0 = 0.0;
578 t1 = 0.0;
579 t2 = 0.0;
580 for(j=js;j<je;j++)
581 {
582 jj = A->bindex[j];
583 t0 += A->value[j*6+0] * x[jj*2+0];
584 t1 += A->value[j*6+1] * x[jj*2+0];
585 t2 += A->value[j*6+2] * x[jj*2+0];
586 t0 += A->value[j*6+3] * x[jj*2+1];
587 t1 += A->value[j*6+4] * x[jj*2+1];
588 t2 += A->value[j*6+5] * x[jj*2+1];
589 }
590 y[3*i+0] = t0;
591 y[3*i+1] = t1;
592 y[3*i+2] = t2;
593 }
594 }
595
lis_matvec_bsr_3x4(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])596 void lis_matvec_bsr_3x4(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
597 {
598 LIS_INT i,j,js,je,jj;
599 LIS_INT nr;
600 LIS_SCALAR t0,t1,t2;
601
602 nr = A->nr;
603
604 #ifdef _OPENMP
605 #pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2)
606 #endif
607 for(i=0; i<nr; i++)
608 {
609 js = A->bptr[i];
610 je = A->bptr[i+1];
611 t0 = 0.0;
612 t1 = 0.0;
613 t2 = 0.0;
614 for(j=js;j<je;j++)
615 {
616 jj = A->bindex[j];
617 t0 += A->value[j*12+ 0] * x[jj*4+0];
618 t1 += A->value[j*12+ 1] * x[jj*4+0];
619 t2 += A->value[j*12+ 2] * x[jj*4+0];
620 t0 += A->value[j*12+ 3] * x[jj*4+1];
621 t1 += A->value[j*12+ 4] * x[jj*4+1];
622 t2 += A->value[j*12+ 5] * x[jj*4+1];
623 t0 += A->value[j*12+ 6] * x[jj*4+2];
624 t1 += A->value[j*12+ 7] * x[jj*4+2];
625 t2 += A->value[j*12+ 8] * x[jj*4+2];
626 t0 += A->value[j*12+ 9] * x[jj*4+3];
627 t1 += A->value[j*12+10] * x[jj*4+3];
628 t2 += A->value[j*12+11] * x[jj*4+3];
629 }
630 y[3*i+0] = t0;
631 y[3*i+1] = t1;
632 y[3*i+2] = t2;
633 }
634 }
635
lis_matvec_bsr_4x4(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])636 void lis_matvec_bsr_4x4(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
637 {
638 LIS_INT i,j,js,je,jj;
639 LIS_INT nr;
640 LIS_SCALAR t0,t1,t2,t3;
641
642 nr = A->nr;
643
644 if( A->is_splited )
645 {
646 #ifdef _OPENMP
647 #pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2,t3)
648 #endif
649 for(i=0; i<nr; i++)
650 {
651 t0 = A->D->value[16*i+0] * x[4*i+0] + A->D->value[16*i+4] * x[4*i+1] + A->D->value[16*i+8] * x[4*i+2] + A->D->value[16*i+12] * x[4*i+3];
652 t1 = A->D->value[16*i+1] * x[4*i+0] + A->D->value[16*i+5] * x[4*i+1] + A->D->value[16*i+9] * x[4*i+2] + A->D->value[16*i+13] * x[4*i+3];
653 t2 = A->D->value[16*i+2] * x[4*i+0] + A->D->value[16*i+6] * x[4*i+1] + A->D->value[16*i+10] * x[4*i+2] + A->D->value[16*i+14] * x[4*i+3];
654 t3 = A->D->value[16*i+3] * x[4*i+0] + A->D->value[16*i+7] * x[4*i+1] + A->D->value[16*i+11] * x[4*i+2] + A->D->value[16*i+15] * x[4*i+3];
655 js = A->L->bptr[i];
656 je = A->L->bptr[i+1];
657 for(j=js;j<je;j++)
658 {
659 jj = A->L->bindex[j];
660 t0 += A->L->value[j*16+ 0] * x[jj*4+0];
661 t1 += A->L->value[j*16+ 1] * x[jj*4+0];
662 t2 += A->L->value[j*16+ 2] * x[jj*4+0];
663 t3 += A->L->value[j*16+ 3] * x[jj*4+0];
664 t0 += A->L->value[j*16+ 4] * x[jj*4+1];
665 t1 += A->L->value[j*16+ 5] * x[jj*4+1];
666 t2 += A->L->value[j*16+ 6] * x[jj*4+1];
667 t3 += A->L->value[j*16+ 7] * x[jj*4+1];
668 t0 += A->L->value[j*16+ 8] * x[jj*4+2];
669 t1 += A->L->value[j*16+ 9] * x[jj*4+2];
670 t2 += A->L->value[j*16+10] * x[jj*4+2];
671 t3 += A->L->value[j*16+11] * x[jj*4+2];
672 t0 += A->L->value[j*16+12] * x[jj*4+3];
673 t1 += A->L->value[j*16+13] * x[jj*4+3];
674 t2 += A->L->value[j*16+14] * x[jj*4+3];
675 t3 += A->L->value[j*16+15] * x[jj*4+3];
676 }
677 js = A->U->bptr[i];
678 je = A->U->bptr[i+1];
679 for(j=js;j<je;j++)
680 {
681 jj = A->U->bindex[j];
682 t0 += A->U->value[j*16+ 0] * x[jj*4+0];
683 t1 += A->U->value[j*16+ 1] * x[jj*4+0];
684 t2 += A->U->value[j*16+ 2] * x[jj*4+0];
685 t3 += A->U->value[j*16+ 3] * x[jj*4+0];
686 t0 += A->U->value[j*16+ 4] * x[jj*4+1];
687 t1 += A->U->value[j*16+ 5] * x[jj*4+1];
688 t2 += A->U->value[j*16+ 6] * x[jj*4+1];
689 t3 += A->U->value[j*16+ 7] * x[jj*4+1];
690 t0 += A->U->value[j*16+ 8] * x[jj*4+2];
691 t1 += A->U->value[j*16+ 9] * x[jj*4+2];
692 t2 += A->U->value[j*16+10] * x[jj*4+2];
693 t3 += A->U->value[j*16+11] * x[jj*4+2];
694 t0 += A->U->value[j*16+12] * x[jj*4+3];
695 t1 += A->U->value[j*16+13] * x[jj*4+3];
696 t2 += A->U->value[j*16+14] * x[jj*4+3];
697 t3 += A->U->value[j*16+15] * x[jj*4+3];
698 }
699 y[4*i+0] = t0;
700 y[4*i+1] = t1;
701 y[4*i+2] = t2;
702 y[4*i+3] = t3;
703 }
704 }
705 else
706 {
707 #ifdef _OPENMP
708 #pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2,t3)
709 #endif
710 for(i=0; i<nr; i++)
711 {
712 js = A->bptr[i];
713 je = A->bptr[i+1];
714 t0 = 0.0;
715 t1 = 0.0;
716 t2 = 0.0;
717 t3 = 0.0;
718 for(j=js;j<je;j++)
719 {
720 jj = A->bindex[j];
721 t0 += A->value[j*16+ 0] * x[jj*4+0];
722 t1 += A->value[j*16+ 1] * x[jj*4+0];
723 t2 += A->value[j*16+ 2] * x[jj*4+0];
724 t3 += A->value[j*16+ 3] * x[jj*4+0];
725 t0 += A->value[j*16+ 4] * x[jj*4+1];
726 t1 += A->value[j*16+ 5] * x[jj*4+1];
727 t2 += A->value[j*16+ 6] * x[jj*4+1];
728 t3 += A->value[j*16+ 7] * x[jj*4+1];
729 t0 += A->value[j*16+ 8] * x[jj*4+2];
730 t1 += A->value[j*16+ 9] * x[jj*4+2];
731 t2 += A->value[j*16+10] * x[jj*4+2];
732 t3 += A->value[j*16+11] * x[jj*4+2];
733 t0 += A->value[j*16+12] * x[jj*4+3];
734 t1 += A->value[j*16+13] * x[jj*4+3];
735 t2 += A->value[j*16+14] * x[jj*4+3];
736 t3 += A->value[j*16+15] * x[jj*4+3];
737 }
738 y[4*i+0] = t0;
739 y[4*i+1] = t1;
740 y[4*i+2] = t2;
741 y[4*i+3] = t3;
742 }
743 }
744 }
745
lis_matvec_bsr_4x1(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])746 void lis_matvec_bsr_4x1(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
747 {
748 LIS_INT i,j,js,je,jj;
749 LIS_INT nr;
750 LIS_SCALAR t0,t1,t2,t3;
751
752 nr = A->nr;
753
754 #ifdef _OPENMP
755 #pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2,t3)
756 #endif
757 for(i=0; i<nr; i++)
758 {
759 js = A->bptr[i];
760 je = A->bptr[i+1];
761 t0 = 0.0;
762 t1 = 0.0;
763 t2 = 0.0;
764 t3 = 0.0;
765 for(j=js;j<je;j++)
766 {
767 jj = A->bindex[j];
768 t0 += A->value[j*4+ 0] * x[jj];
769 t1 += A->value[j*4+ 1] * x[jj];
770 t2 += A->value[j*4+ 2] * x[jj];
771 t3 += A->value[j*4+ 3] * x[jj];
772 }
773 y[4*i+0] = t0;
774 y[4*i+1] = t1;
775 y[4*i+2] = t2;
776 y[4*i+3] = t3;
777 }
778 }
779
lis_matvec_bsr_4x2(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])780 void lis_matvec_bsr_4x2(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
781 {
782 LIS_INT i,j,js,je,jj;
783 LIS_INT nr;
784 LIS_SCALAR t0,t1,t2,t3;
785
786 nr = A->nr;
787
788 #ifdef _OPENMP
789 #pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2,t3)
790 #endif
791 for(i=0; i<nr; i++)
792 {
793 js = A->bptr[i];
794 je = A->bptr[i+1];
795 t0 = 0.0;
796 t1 = 0.0;
797 t2 = 0.0;
798 t3 = 0.0;
799 for(j=js;j<je;j++)
800 {
801 jj = A->bindex[j];
802 t0 += A->value[j*8+ 0] * x[jj*2+0];
803 t1 += A->value[j*8+ 1] * x[jj*2+0];
804 t2 += A->value[j*8+ 2] * x[jj*2+0];
805 t3 += A->value[j*8+ 3] * x[jj*2+0];
806 t0 += A->value[j*8+ 4] * x[jj*2+1];
807 t1 += A->value[j*8+ 5] * x[jj*2+1];
808 t2 += A->value[j*8+ 6] * x[jj*2+1];
809 t3 += A->value[j*8+ 7] * x[jj*2+1];
810 }
811 y[4*i+0] = t0;
812 y[4*i+1] = t1;
813 y[4*i+2] = t2;
814 y[4*i+3] = t3;
815 }
816 }
817
lis_matvec_bsr_4x3(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])818 void lis_matvec_bsr_4x3(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
819 {
820 LIS_INT i,j,js,je,jj;
821 LIS_INT nr;
822 LIS_SCALAR t0,t1,t2,t3;
823
824 nr = A->nr;
825
826 #ifdef _OPENMP
827 #pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2,t3)
828 #endif
829 for(i=0; i<nr; i++)
830 {
831 js = A->bptr[i];
832 je = A->bptr[i+1];
833 t0 = 0.0;
834 t1 = 0.0;
835 t2 = 0.0;
836 t3 = 0.0;
837 for(j=js;j<je;j++)
838 {
839 jj = A->bindex[j];
840 t0 += A->value[j*12+ 0] * x[jj*3+0];
841 t1 += A->value[j*12+ 1] * x[jj*3+0];
842 t2 += A->value[j*12+ 2] * x[jj*3+0];
843 t3 += A->value[j*12+ 3] * x[jj*3+0];
844 t0 += A->value[j*12+ 4] * x[jj*3+1];
845 t1 += A->value[j*12+ 5] * x[jj*3+1];
846 t2 += A->value[j*12+ 6] * x[jj*3+1];
847 t3 += A->value[j*12+ 7] * x[jj*3+1];
848 t0 += A->value[j*12+ 8] * x[jj*3+2];
849 t1 += A->value[j*12+ 9] * x[jj*3+2];
850 t2 += A->value[j*12+10] * x[jj*3+2];
851 t3 += A->value[j*12+11] * x[jj*3+2];
852 }
853 y[4*i+0] = t0;
854 y[4*i+1] = t1;
855 y[4*i+2] = t2;
856 y[4*i+3] = t3;
857 }
858 }
859
lis_matvech_bsr(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])860 void lis_matvech_bsr(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
861 {
862 LIS_INT i,j,k;
863 LIS_INT bi,bj,bc,bs;
864 LIS_INT nr,bnr,bnc;
865 LIS_INT n,np;
866 #ifdef _OPENMP
867 LIS_INT nprocs,my_rank;
868 LIS_SCALAR t;
869 LIS_SCALAR *w;
870 #endif
871
872 n = A->n;
873 np = A->np;
874 nr = A->nr;
875 bnr = A->bnr;
876 bnc = A->bnc;
877 bs = bnr*bnc;
878 if( A->is_splited )
879 {
880 for(i=0;i<n;i++)
881 {
882 y[i] = 0.0;
883 }
884 for(bi=0;bi<nr;bi++)
885 {
886 k = bi*bs;
887 for(j=0;j<bnc;j++)
888 {
889 for(i=0;i<bnr;i++)
890 {
891 y[bi*bnr+j] += conj(A->D->value[k++]) * x[bi*bnr+i];
892 }
893 }
894 }
895 for(bi=0;bi<nr;bi++)
896 {
897 for(bc=A->L->bptr[bi];bc<A->L->bptr[bi+1];bc++)
898 {
899 bj = A->L->bindex[bc] * bnc;
900 k = bc*bs;
901 for(j=0;j<bnc;j++)
902 {
903 for(i=0;i<bnr;i++)
904 {
905 y[bj+j] += conj(A->L->value[k]) * x[bi*bnr+i];
906 k++;
907 }
908 }
909 }
910 for(bc=A->U->bptr[bi];bc<A->U->bptr[bi+1];bc++)
911 {
912 bj = A->U->bindex[bc] * bnc;
913 k = bc*bs;
914 for(j=0;j<bnc;j++)
915 {
916 for(i=0;i<bnr;i++)
917 {
918 y[bj+j] += conj(A->U->value[k]) * x[bi*bnr+i];
919 k++;
920 }
921 }
922 }
923 }
924 }
925 else
926 {
927 #ifdef _OPENMP
928 nprocs = omp_get_max_threads();
929 w = (LIS_SCALAR *)lis_malloc( nprocs*np*sizeof(LIS_SCALAR),"lis_matvech_bsr::w" );
930
931 #pragma omp parallel private(bi,bc,bj,i,j,k,my_rank)
932 {
933 my_rank = omp_get_thread_num();
934
935 #pragma omp for
936 for(j=0;j<nprocs;j++)
937 {
938 memset( &w[j*np], 0, np*sizeof(LIS_SCALAR) );
939 }
940 #pragma omp for
941 for(bi=0;bi<nr;bi++)
942 {
943 for(bc=A->bptr[bi];bc<A->bptr[bi+1];bc++)
944 {
945 bj = my_rank*np + A->bindex[bc] * bnc;
946 k = bc*bs;
947 for(j=0;j<bnc;j++)
948 {
949 for(i=0;i<bnr;i++)
950 {
951 w[bj+j] += conj(A->value[k]) * x[bi*bnr+i];
952 k++;
953 }
954 }
955 }
956 }
957 #pragma omp barrier
958 #pragma omp for
959 for(i=0;i<np;i++)
960 {
961 t = 0.0;
962 for(j=0;j<nprocs;j++)
963 {
964 t += w[j*np+i];
965 }
966 y[i] = t;
967 }
968 }
969 lis_free(w);
970 #else
971 for(i=0; i<n; i++)
972 {
973 y[i] = 0.0;
974 }
975 for(bi=0;bi<nr;bi++)
976 {
977 for(bc=A->bptr[bi];bc<A->bptr[bi+1];bc++)
978 {
979 bj = A->bindex[bc] * bnc;
980 k = bc*bs;
981 for(j=0;j<bnc;j++)
982 {
983 for(i=0;i<bnr;i++)
984 {
985 y[bj+j] += conj(A->value[k]) * x[bi*bnr+i];
986 k++;
987 }
988 }
989 }
990 }
991 #endif
992 }
993 }
994