1 /* Copyright (C) 2005 The Scalable Software Infrastructure Project. All rights reserved.
2 
3    Redistribution and use in source and binary forms, with or without
4    modification, are permitted provided that the following conditions are met:
5    1. Redistributions of source code must retain the above copyright
6       notice, this list of conditions and the following disclaimer.
7    2. Redistributions in binary form must reproduce the above copyright
8       notice, this list of conditions and the following disclaimer in the
9       documentation and/or other materials provided with the distribution.
10    3. Neither the name of the project nor the names of its contributors
11       may be used to endorse or promote products derived from this software
12       without specific prior written permission.
13 
14    THIS SOFTWARE IS PROVIDED BY THE SCALABLE SOFTWARE INFRASTRUCTURE PROJECT
15    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
16    TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17    PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE SCALABLE SOFTWARE INFRASTRUCTURE
18    PROJECT BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
19    OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
20    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
21    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
22    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
24    POSSIBILITY OF SUCH DAMAGE.
25 */
26 
27 #ifdef HAVE_CONFIG_H
28 	#include "lis_config.h"
29 #else
30 #ifdef HAVE_CONFIG_WIN_H
31 	#include "lis_config_win.h"
32 #endif
33 #endif
34 
35 #include <stdio.h>
36 #include <stdlib.h>
37 #ifdef HAVE_MALLOC_H
38         #include <malloc.h>
39 #endif
40 #include <string.h>
41 #include <stdarg.h>
42 #ifdef _OPENMP
43 	#include <omp.h>
44 #endif
45 #ifdef USE_MPI
46 	#include <mpi.h>
47 #endif
48 #include "lislib.h"
49 
50 LIS_MATVEC_XXX lis_matvec_bsr_xxx[4][4] = {
51 	{lis_matvec_bsr_1x1, lis_matvec_bsr_1x2, lis_matvec_bsr_1x3, lis_matvec_bsr_1x4},
52 	{lis_matvec_bsr_2x1, lis_matvec_bsr_2x2, lis_matvec_bsr_2x3, lis_matvec_bsr_2x4},
53 	{lis_matvec_bsr_3x1, lis_matvec_bsr_3x2, lis_matvec_bsr_3x3, lis_matvec_bsr_3x4},
54 	{lis_matvec_bsr_4x1, lis_matvec_bsr_4x2, lis_matvec_bsr_4x3, lis_matvec_bsr_4x4}
55 };
56 
lis_matvec_bsr(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])57 void lis_matvec_bsr(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
58 {
59 	LIS_INT i,j,k;
60 	LIS_INT bi,bj,bc,bs;
61 	LIS_INT nr,bnr,bnc;
62 	LIS_INT n;
63 
64 	n   = A->n;
65 	nr  = A->nr;
66 	bnr = A->bnr;
67 	bnc = A->bnc;
68 	bs  = bnr*bnc;
69 
70 	if( A->is_splited )
71 	{
72 		#ifdef _OPENMP
73 		#pragma omp parallel for private(i)
74 		#endif
75 		for(i=0; i<n; i++)
76 		{
77 			y[i] = 0.0;
78 		}
79 		#ifdef _OPENMP
80 		#pragma omp parallel for private(i,j,k,bi,bj,bc)
81 		#endif
82 		for(bi=0;bi<nr;bi++)
83 		{
84 			k = bi*bs;
85 			for(j=0;j<bnc;j++)
86 			{
87 				for(i=0;i<bnr;i++)
88 				{
89 					y[bi*bnr+i] += A->D->value[k] * x[bi*bnr+j];
90 					k++;
91 				}
92 			}
93 			for(bc=A->L->bptr[bi];bc<A->L->bptr[bi+1];bc++)
94 			{
95 				bj   = A->L->bindex[bc] * bnc;
96 				k    = bc*bs;
97 				for(j=0;j<bnc;j++)
98 				{
99 					for(i=0;i<bnr;i++)
100 					{
101 						y[bi*bnr+i] += A->L->value[k] * x[bj+j];
102 						k++;
103 					}
104 				}
105 			}
106 			for(bc=A->U->bptr[bi];bc<A->U->bptr[bi+1];bc++)
107 			{
108 				bj   = A->U->bindex[bc] * bnc;
109 				k    = bc*bs;
110 				for(j=0;j<bnc;j++)
111 				{
112 					for(i=0;i<bnr;i++)
113 					{
114 						y[bi*bnr+i] += A->U->value[k] * x[bj+j];
115 						k++;
116 					}
117 				}
118 			}
119 		}
120 	}
121 	else
122 	{
123 		#ifdef _OPENMP
124 		#pragma omp parallel for private(i)
125 		#endif
126 		for(i=0; i<n; i++)
127 		{
128 			y[i] = 0.0;
129 		}
130 		#ifdef _OPENMP
131 		#pragma omp parallel for private(i,j,k,bi,bj,bc)
132 		#endif
133 		for(bi=0;bi<nr;bi++)
134 		{
135 			for(bc=A->bptr[bi];bc<A->bptr[bi+1];bc++)
136 			{
137 				bj   = A->bindex[bc] * bnc;
138 				k    = bc*bs;
139 				for(j=0;j<bnc;j++)
140 				{
141 					for(i=0;i<bnr;i++)
142 					{
143 						y[bi*bnr+i] += A->value[k] * x[bj+j];
144 						k++;
145 					}
146 				}
147 			}
148 		}
149 	}
150 }
151 
lis_matvec_bsr_1x1(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])152 void lis_matvec_bsr_1x1(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
153 {
154 	LIS_INT i,j,js,je,jj;
155 	LIS_INT nr;
156 	LIS_SCALAR t0;
157 
158 	nr   = A->nr;
159 	if( A->is_splited )
160 	{
161 		#ifdef _OPENMP
162 		#pragma omp parallel for private(i,j,jj,js,je,t0)
163 		#endif
164 		for(i=0; i<nr; i++)
165 		{
166 			t0 = A->D->value[i] * x[i];
167 			js = A->L->bptr[i];
168 			je = A->L->bptr[i+1];
169 			for(j=js;j<je;j++)
170 			{
171 				jj  = A->L->bindex[j];
172 				t0 += A->L->value[j] * x[jj];
173 			}
174 			js = A->U->bptr[i];
175 			je = A->U->bptr[i+1];
176 			for(j=js;j<je;j++)
177 			{
178 				jj  = A->U->bindex[j];
179 				t0 += A->U->value[j] * x[jj];
180 			}
181 			y[i] = t0;
182 		}
183 	}
184 	else
185 	{
186 		#ifdef _OPENMP
187 		#pragma omp parallel for private(i,j,jj,js,je,t0)
188 		#endif
189 		for(i=0; i<nr; i++)
190 		{
191 			js = A->bptr[i];
192 			je = A->bptr[i+1];
193 			t0 = 0.0;
194 			for(j=js;j<je;j++)
195 			{
196 				jj  = A->bindex[j];
197 				t0 += A->value[j] * x[jj];
198 			}
199 			y[i] = t0;
200 		}
201 	}
202 }
203 
lis_matvec_bsr_1x2(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])204 void lis_matvec_bsr_1x2(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
205 {
206 	LIS_INT i,j,js,je,jj;
207 	LIS_INT nr;
208 	LIS_SCALAR t0;
209 
210 	nr   = A->nr;
211 
212 	#ifdef _OPENMP
213 	#pragma omp parallel for private(i,j,jj,js,je,t0)
214 	#endif
215 	for(i=0; i<nr; i++)
216 	{
217 		js = A->bptr[i];
218 		je = A->bptr[i+1];
219 		t0 = 0.0;
220 		for(j=js;j<je;j++)
221 		{
222 			jj  = A->bindex[j];
223 			t0 += A->value[j*2+0] * x[jj*2+0];
224 			t0 += A->value[j*2+1] * x[jj*2+1];
225 		}
226 		y[i] = t0;
227 	}
228 }
229 
lis_matvec_bsr_1x3(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])230 void lis_matvec_bsr_1x3(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
231 {
232 	LIS_INT i,j,js,je,jj;
233 	LIS_INT nr;
234 	LIS_SCALAR t0;
235 
236 	nr   = A->nr;
237 
238 	#ifdef _OPENMP
239 	#pragma omp parallel for private(i,j,jj,js,je,t0)
240 	#endif
241 	for(i=0; i<nr; i++)
242 	{
243 		js = A->bptr[i];
244 		je = A->bptr[i+1];
245 		t0 = 0.0;
246 		for(j=js;j<je;j++)
247 		{
248 			jj  = A->bindex[j];
249 			t0 += A->value[j*3+0] * x[jj*3+0];
250 			t0 += A->value[j*3+1] * x[jj*3+1];
251 			t0 += A->value[j*3+2] * x[jj*3+2];
252 		}
253 		y[i] = t0;
254 	}
255 }
256 
lis_matvec_bsr_1x4(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])257 void lis_matvec_bsr_1x4(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
258 {
259 	LIS_INT i,j,js,je,jj;
260 	LIS_INT nr;
261 	LIS_SCALAR t0;
262 
263 	nr   = A->nr;
264 
265 	#ifdef _OPENMP
266 	#pragma omp parallel for private(i,j,jj,js,je,t0)
267 	#endif
268 	for(i=0; i<nr; i++)
269 	{
270 		js = A->bptr[i];
271 		je = A->bptr[i+1];
272 		t0 = 0.0;
273 		for(j=js;j<je;j++)
274 		{
275 			jj  = A->bindex[j];
276 			t0 += A->value[j*4+0] * x[jj*4+0];
277 			t0 += A->value[j*4+1] * x[jj*4+1];
278 			t0 += A->value[j*4+2] * x[jj*4+2];
279 			t0 += A->value[j*4+3] * x[jj*4+3];
280 		}
281 		y[i] = t0;
282 	}
283 }
284 
lis_matvec_bsr_2x2(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])285 void lis_matvec_bsr_2x2(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
286 {
287 	LIS_INT i,j,js,je,jj;
288 	LIS_INT nr;
289 	LIS_SCALAR t0,t1;
290 
291 	nr   = A->nr;
292 
293 	if( A->is_splited )
294 	{
295 		#ifdef _OPENMP
296 		#pragma omp parallel for private(i,j,jj,js,je,t0,t1)
297 		#endif
298 		for(i=0; i<nr; i++)
299 		{
300 			t0 = A->D->value[4*i+0] * x[2*i+0] + A->D->value[4*i+2] * x[2*i+1];
301 			t1 = A->D->value[4*i+1] * x[2*i+0] + A->D->value[4*i+3] * x[2*i+1];
302 			js = A->L->bptr[i];
303 			je = A->L->bptr[i+1];
304 			for(j=js;j<je;j++)
305 			{
306 				jj  = A->L->bindex[j];
307 				t0 += A->L->value[j*4+0] * x[jj*2+0];
308 				t1 += A->L->value[j*4+1] * x[jj*2+0];
309 				t0 += A->L->value[j*4+2] * x[jj*2+1];
310 				t1 += A->L->value[j*4+3] * x[jj*2+1];
311 			}
312 			js = A->U->bptr[i];
313 			je = A->U->bptr[i+1];
314 			for(j=js;j<je;j++)
315 			{
316 				jj  = A->U->bindex[j];
317 				t0 += A->U->value[j*4+0] * x[jj*2+0];
318 				t1 += A->U->value[j*4+1] * x[jj*2+0];
319 				t0 += A->U->value[j*4+2] * x[jj*2+1];
320 				t1 += A->U->value[j*4+3] * x[jj*2+1];
321 			}
322 			y[2*i+0] = t0;
323 			y[2*i+1] = t1;
324 		}
325 	}
326 	else
327 	{
328 		#ifdef _OPENMP
329 		#pragma omp parallel for private(i,j,jj,js,je,t0,t1)
330 		#endif
331 		for(i=0; i<nr; i++)
332 		{
333 			js = A->bptr[i];
334 			je = A->bptr[i+1];
335 			t0 = 0.0;
336 			t1 = 0.0;
337 			for(j=js;j<je;j++)
338 			{
339 				jj  = A->bindex[j];
340 				t0 += A->value[j*4+0] * x[jj*2+0];
341 				t1 += A->value[j*4+1] * x[jj*2+0];
342 				t0 += A->value[j*4+2] * x[jj*2+1];
343 				t1 += A->value[j*4+3] * x[jj*2+1];
344 			}
345 			y[2*i+0] = t0;
346 			y[2*i+1] = t1;
347 		}
348 	}
349 }
350 
lis_matvec_bsr_2x1(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])351 void lis_matvec_bsr_2x1(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
352 {
353 	LIS_INT i,j,js,je,jj;
354 	LIS_INT nr;
355 	LIS_SCALAR t0,t1;
356 
357 	nr   = A->nr;
358 
359 	#ifdef _OPENMP
360 	#pragma omp parallel for private(i,j,jj,js,je,t0,t1)
361 	#endif
362 	for(i=0; i<nr; i++)
363 	{
364 		js = A->bptr[i];
365 		je = A->bptr[i+1];
366 		t0 = 0.0;
367 		t1 = 0.0;
368 		for(j=js;j<je;j++)
369 		{
370 			jj  = A->bindex[j];
371 			t0 += A->value[j*2+0] * x[jj];
372 			t1 += A->value[j*2+1] * x[jj];
373 		}
374 		y[2*i+0] = t0;
375 		y[2*i+1] = t1;
376 	}
377 }
378 
lis_matvec_bsr_2x3(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])379 void lis_matvec_bsr_2x3(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
380 {
381 	LIS_INT i,j,js,je,jj;
382 	LIS_INT nr;
383 	LIS_SCALAR t0,t1;
384 
385 	nr   = A->nr;
386 
387 	#ifdef _OPENMP
388 	#pragma omp parallel for private(i,j,jj,js,je,t0,t1)
389 	#endif
390 	for(i=0; i<nr; i++)
391 	{
392 		js = A->bptr[i];
393 		je = A->bptr[i+1];
394 		t0 = 0.0;
395 		t1 = 0.0;
396 		for(j=js;j<je;j++)
397 		{
398 			jj  = A->bindex[j];
399 			t0 += A->value[j*6+0] * x[jj*3+0];
400 			t1 += A->value[j*6+1] * x[jj*3+0];
401 			t0 += A->value[j*6+2] * x[jj*3+1];
402 			t1 += A->value[j*6+3] * x[jj*3+1];
403 			t0 += A->value[j*6+4] * x[jj*3+2];
404 			t1 += A->value[j*6+5] * x[jj*3+2];
405 		}
406 		y[2*i+0] = t0;
407 		y[2*i+1] = t1;
408 	}
409 }
410 
lis_matvec_bsr_2x4(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])411 void lis_matvec_bsr_2x4(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
412 {
413 	LIS_INT i,j,js,je,jj;
414 	LIS_INT nr;
415 	LIS_SCALAR t0,t1;
416 
417 	nr   = A->nr;
418 
419 	#ifdef _OPENMP
420 	#pragma omp parallel for private(i,j,jj,js,je,t0,t1)
421 	#endif
422 	for(i=0; i<nr; i++)
423 	{
424 		js = A->bptr[i];
425 		je = A->bptr[i+1];
426 		t0 = 0.0;
427 		t1 = 0.0;
428 		for(j=js;j<je;j++)
429 		{
430 			jj  = A->bindex[j];
431 			t0 +=  A->value[j*8+0] * x[jj*4+0];
432 			t1 +=  A->value[j*8+1] * x[jj*4+0];
433 			t0 +=  A->value[j*8+2] * x[jj*4+1];
434 			t1 +=  A->value[j*8+3] * x[jj*4+1];
435 			t0 +=  A->value[j*8+4] * x[jj*4+2];
436 			t1 +=  A->value[j*8+5] * x[jj*4+2];
437 			t0 +=  A->value[j*8+6] * x[jj*4+3];
438 			t1 +=  A->value[j*8+7] * x[jj*4+3];
439 		}
440 		y[2*i+0] = t0;
441 		y[2*i+1] = t1;
442 	}
443 }
444 
lis_matvec_bsr_3x3(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])445 void lis_matvec_bsr_3x3(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
446 {
447 	LIS_INT i,j,js,je,jj;
448 	LIS_INT nr;
449 	LIS_SCALAR t0,t1,t2;
450 
451 	nr   = A->nr;
452 
453 	if( A->is_splited )
454 	{
455 		#ifdef _OPENMP
456 		#pragma omp parallel for private(i,j,jj,js,je,t0,t1)
457 		#endif
458 		for(i=0; i<nr; i++)
459 		{
460 			t0 = A->D->value[9*i+0] * x[3*i+0] + A->D->value[9*i+3] * x[3*i+1] + A->D->value[9*i+6] * x[3*i+2];
461 			t1 = A->D->value[9*i+1] * x[3*i+0] + A->D->value[9*i+4] * x[3*i+1] + A->D->value[9*i+7] * x[3*i+2];
462 			t2 = A->D->value[9*i+2] * x[3*i+0] + A->D->value[9*i+5] * x[3*i+1] + A->D->value[9*i+8] * x[3*i+2];
463 			js = A->L->bptr[i];
464 			je = A->L->bptr[i+1];
465 			for(j=js;j<je;j++)
466 			{
467 				jj  = A->L->bindex[j];
468 				t0 +=  A->L->value[j*9+0] * x[jj*3+0];
469 				t1 +=  A->L->value[j*9+1] * x[jj*3+0];
470 				t2 +=  A->L->value[j*9+2] * x[jj*3+0];
471 				t0 +=  A->L->value[j*9+3] * x[jj*3+1];
472 				t1 +=  A->L->value[j*9+4] * x[jj*3+1];
473 				t2 +=  A->L->value[j*9+5] * x[jj*3+1];
474 				t0 +=  A->L->value[j*9+6] * x[jj*3+2];
475 				t1 +=  A->L->value[j*9+7] * x[jj*3+2];
476 				t2 +=  A->L->value[j*9+8] * x[jj*3+2];
477 			}
478 			js = A->U->bptr[i];
479 			je = A->U->bptr[i+1];
480 			for(j=js;j<je;j++)
481 			{
482 				jj  = A->U->bindex[j];
483 				t0 +=  A->U->value[j*9+0] * x[jj*3+0];
484 				t1 +=  A->U->value[j*9+1] * x[jj*3+0];
485 				t2 +=  A->U->value[j*9+2] * x[jj*3+0];
486 				t0 +=  A->U->value[j*9+3] * x[jj*3+1];
487 				t1 +=  A->U->value[j*9+4] * x[jj*3+1];
488 				t2 +=  A->U->value[j*9+5] * x[jj*3+1];
489 				t0 +=  A->U->value[j*9+6] * x[jj*3+2];
490 				t1 +=  A->U->value[j*9+7] * x[jj*3+2];
491 				t2 +=  A->U->value[j*9+8] * x[jj*3+2];
492 			}
493 			y[3*i+0] = t0;
494 			y[3*i+1] = t1;
495 			y[3*i+2] = t2;
496 		}
497 	}
498 	else
499 	{
500 		#ifdef _OPENMP
501 		#pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2)
502 		#endif
503 		for(i=0; i<nr; i++)
504 		{
505 			js = A->bptr[i];
506 			je = A->bptr[i+1];
507 			t0 = 0.0;
508 			t1 = 0.0;
509 			t2 = 0.0;
510 			for(j=js;j<je;j++)
511 			{
512 				jj  = A->bindex[j];
513 				t0 +=  A->value[j*9+0] * x[jj*3+0];
514 				t1 +=  A->value[j*9+1] * x[jj*3+0];
515 				t2 +=  A->value[j*9+2] * x[jj*3+0];
516 				t0 +=  A->value[j*9+3] * x[jj*3+1];
517 				t1 +=  A->value[j*9+4] * x[jj*3+1];
518 				t2 +=  A->value[j*9+5] * x[jj*3+1];
519 				t0 +=  A->value[j*9+6] * x[jj*3+2];
520 				t1 +=  A->value[j*9+7] * x[jj*3+2];
521 				t2 +=  A->value[j*9+8] * x[jj*3+2];
522 			}
523 			y[3*i+0] = t0;
524 			y[3*i+1] = t1;
525 			y[3*i+2] = t2;
526 		}
527 	}
528 }
529 
lis_matvec_bsr_3x1(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])530 void lis_matvec_bsr_3x1(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
531 {
532 	LIS_INT i,j,js,je,jj,ii;
533 	LIS_INT nr;
534 	LIS_SCALAR t0,t1,t2;
535 
536 	nr   = A->nr;
537 
538 	#ifdef _OPENMP
539 	#pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2,ii)
540 	#endif
541 	for(i=0; i<nr; i++)
542 	{
543 		ii = 3*i;
544 		js = A->bptr[i];
545 		je = A->bptr[i+1];
546 		t0 = 0.0;
547 		t1 = 0.0;
548 		t2 = 0.0;
549 		for(j=js;j<je;j++)
550 		{
551 			jj  = A->bindex[j];
552 			t0 +=  A->value[j*3+0] * x[jj];
553 			t1 +=  A->value[j*3+1] * x[jj];
554 			t2 +=  A->value[j*3+2] * x[jj];
555 		}
556 		y[ii+0] = t0;
557 		y[ii+1] = t1;
558 		y[ii+2] = t2;
559 	}
560 }
561 
lis_matvec_bsr_3x2(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])562 void lis_matvec_bsr_3x2(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
563 {
564 	LIS_INT i,j,js,je,jj;
565 	LIS_INT nr;
566 	LIS_SCALAR t0,t1,t2;
567 
568 	nr   = A->nr;
569 
570 	#ifdef _OPENMP
571 	#pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2)
572 	#endif
573 	for(i=0; i<nr; i++)
574 	{
575 		js = A->bptr[i];
576 		je = A->bptr[i+1];
577 		t0 = 0.0;
578 		t1 = 0.0;
579 		t2 = 0.0;
580 		for(j=js;j<je;j++)
581 		{
582 			jj  = A->bindex[j];
583 			t0 +=  A->value[j*6+0] * x[jj*2+0];
584 			t1 +=  A->value[j*6+1] * x[jj*2+0];
585 			t2 +=  A->value[j*6+2] * x[jj*2+0];
586 			t0 +=  A->value[j*6+3] * x[jj*2+1];
587 			t1 +=  A->value[j*6+4] * x[jj*2+1];
588 			t2 +=  A->value[j*6+5] * x[jj*2+1];
589 		}
590 		y[3*i+0] = t0;
591 		y[3*i+1] = t1;
592 		y[3*i+2] = t2;
593 	}
594 }
595 
lis_matvec_bsr_3x4(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])596 void lis_matvec_bsr_3x4(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
597 {
598 	LIS_INT i,j,js,je,jj;
599 	LIS_INT nr;
600 	LIS_SCALAR t0,t1,t2;
601 
602 	nr   = A->nr;
603 
604 	#ifdef _OPENMP
605 	#pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2)
606 	#endif
607 	for(i=0; i<nr; i++)
608 	{
609 		js = A->bptr[i];
610 		je = A->bptr[i+1];
611 		t0 = 0.0;
612 		t1 = 0.0;
613 		t2 = 0.0;
614 		for(j=js;j<je;j++)
615 		{
616 			jj  = A->bindex[j];
617 			t0 +=  A->value[j*12+ 0] * x[jj*4+0];
618 			t1 +=  A->value[j*12+ 1] * x[jj*4+0];
619 			t2 +=  A->value[j*12+ 2] * x[jj*4+0];
620 			t0 +=  A->value[j*12+ 3] * x[jj*4+1];
621 			t1 +=  A->value[j*12+ 4] * x[jj*4+1];
622 			t2 +=  A->value[j*12+ 5] * x[jj*4+1];
623 			t0 +=  A->value[j*12+ 6] * x[jj*4+2];
624 			t1 +=  A->value[j*12+ 7] * x[jj*4+2];
625 			t2 +=  A->value[j*12+ 8] * x[jj*4+2];
626 			t0 +=  A->value[j*12+ 9] * x[jj*4+3];
627 			t1 +=  A->value[j*12+10] * x[jj*4+3];
628 			t2 +=  A->value[j*12+11] * x[jj*4+3];
629 		}
630 		y[3*i+0] = t0;
631 		y[3*i+1] = t1;
632 		y[3*i+2] = t2;
633 	}
634 }
635 
lis_matvec_bsr_4x4(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])636 void lis_matvec_bsr_4x4(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
637 {
638 	LIS_INT i,j,js,je,jj;
639 	LIS_INT nr;
640 	LIS_SCALAR t0,t1,t2,t3;
641 
642 	nr   = A->nr;
643 
644 	if( A->is_splited )
645 	{
646 		#ifdef _OPENMP
647 		#pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2,t3)
648 		#endif
649 		for(i=0; i<nr; i++)
650 		{
651 			t0 = A->D->value[16*i+0] * x[4*i+0] + A->D->value[16*i+4] * x[4*i+1] + A->D->value[16*i+8]  * x[4*i+2] + A->D->value[16*i+12] * x[4*i+3];
652 			t1 = A->D->value[16*i+1] * x[4*i+0] + A->D->value[16*i+5] * x[4*i+1] + A->D->value[16*i+9]  * x[4*i+2] + A->D->value[16*i+13] * x[4*i+3];
653 			t2 = A->D->value[16*i+2] * x[4*i+0] + A->D->value[16*i+6] * x[4*i+1] + A->D->value[16*i+10] * x[4*i+2] + A->D->value[16*i+14] * x[4*i+3];
654 			t3 = A->D->value[16*i+3] * x[4*i+0] + A->D->value[16*i+7] * x[4*i+1] + A->D->value[16*i+11] * x[4*i+2] + A->D->value[16*i+15] * x[4*i+3];
655 			js = A->L->bptr[i];
656 			je = A->L->bptr[i+1];
657 			for(j=js;j<je;j++)
658 			{
659 				jj  = A->L->bindex[j];
660 				t0 +=  A->L->value[j*16+ 0] * x[jj*4+0];
661 				t1 +=  A->L->value[j*16+ 1] * x[jj*4+0];
662 				t2 +=  A->L->value[j*16+ 2] * x[jj*4+0];
663 				t3 +=  A->L->value[j*16+ 3] * x[jj*4+0];
664 				t0 +=  A->L->value[j*16+ 4] * x[jj*4+1];
665 				t1 +=  A->L->value[j*16+ 5] * x[jj*4+1];
666 				t2 +=  A->L->value[j*16+ 6] * x[jj*4+1];
667 				t3 +=  A->L->value[j*16+ 7] * x[jj*4+1];
668 				t0 +=  A->L->value[j*16+ 8] * x[jj*4+2];
669 				t1 +=  A->L->value[j*16+ 9] * x[jj*4+2];
670 				t2 +=  A->L->value[j*16+10] * x[jj*4+2];
671 				t3 +=  A->L->value[j*16+11] * x[jj*4+2];
672 				t0 +=  A->L->value[j*16+12] * x[jj*4+3];
673 				t1 +=  A->L->value[j*16+13] * x[jj*4+3];
674 				t2 +=  A->L->value[j*16+14] * x[jj*4+3];
675 				t3 +=  A->L->value[j*16+15] * x[jj*4+3];
676 			}
677 			js = A->U->bptr[i];
678 			je = A->U->bptr[i+1];
679 			for(j=js;j<je;j++)
680 			{
681 				jj  = A->U->bindex[j];
682 				t0 +=  A->U->value[j*16+ 0] * x[jj*4+0];
683 				t1 +=  A->U->value[j*16+ 1] * x[jj*4+0];
684 				t2 +=  A->U->value[j*16+ 2] * x[jj*4+0];
685 				t3 +=  A->U->value[j*16+ 3] * x[jj*4+0];
686 				t0 +=  A->U->value[j*16+ 4] * x[jj*4+1];
687 				t1 +=  A->U->value[j*16+ 5] * x[jj*4+1];
688 				t2 +=  A->U->value[j*16+ 6] * x[jj*4+1];
689 				t3 +=  A->U->value[j*16+ 7] * x[jj*4+1];
690 				t0 +=  A->U->value[j*16+ 8] * x[jj*4+2];
691 				t1 +=  A->U->value[j*16+ 9] * x[jj*4+2];
692 				t2 +=  A->U->value[j*16+10] * x[jj*4+2];
693 				t3 +=  A->U->value[j*16+11] * x[jj*4+2];
694 				t0 +=  A->U->value[j*16+12] * x[jj*4+3];
695 				t1 +=  A->U->value[j*16+13] * x[jj*4+3];
696 				t2 +=  A->U->value[j*16+14] * x[jj*4+3];
697 				t3 +=  A->U->value[j*16+15] * x[jj*4+3];
698 			}
699 			y[4*i+0] = t0;
700 			y[4*i+1] = t1;
701 			y[4*i+2] = t2;
702 			y[4*i+3] = t3;
703 		}
704 	}
705 	else
706 	{
707 		#ifdef _OPENMP
708 		#pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2,t3)
709 		#endif
710 		for(i=0; i<nr; i++)
711 		{
712 			js = A->bptr[i];
713 			je = A->bptr[i+1];
714 			t0 = 0.0;
715 			t1 = 0.0;
716 			t2 = 0.0;
717 			t3 = 0.0;
718 			for(j=js;j<je;j++)
719 			{
720 				jj  = A->bindex[j];
721 				t0 +=  A->value[j*16+ 0] * x[jj*4+0];
722 				t1 +=  A->value[j*16+ 1] * x[jj*4+0];
723 				t2 +=  A->value[j*16+ 2] * x[jj*4+0];
724 				t3 +=  A->value[j*16+ 3] * x[jj*4+0];
725 				t0 +=  A->value[j*16+ 4] * x[jj*4+1];
726 				t1 +=  A->value[j*16+ 5] * x[jj*4+1];
727 				t2 +=  A->value[j*16+ 6] * x[jj*4+1];
728 				t3 +=  A->value[j*16+ 7] * x[jj*4+1];
729 				t0 +=  A->value[j*16+ 8] * x[jj*4+2];
730 				t1 +=  A->value[j*16+ 9] * x[jj*4+2];
731 				t2 +=  A->value[j*16+10] * x[jj*4+2];
732 				t3 +=  A->value[j*16+11] * x[jj*4+2];
733 				t0 +=  A->value[j*16+12] * x[jj*4+3];
734 				t1 +=  A->value[j*16+13] * x[jj*4+3];
735 				t2 +=  A->value[j*16+14] * x[jj*4+3];
736 				t3 +=  A->value[j*16+15] * x[jj*4+3];
737 			}
738 			y[4*i+0] = t0;
739 			y[4*i+1] = t1;
740 			y[4*i+2] = t2;
741 			y[4*i+3] = t3;
742 		}
743 	}
744 }
745 
lis_matvec_bsr_4x1(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])746 void lis_matvec_bsr_4x1(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
747 {
748 	LIS_INT i,j,js,je,jj;
749 	LIS_INT nr;
750 	LIS_SCALAR t0,t1,t2,t3;
751 
752 	nr   = A->nr;
753 
754 	#ifdef _OPENMP
755 	#pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2,t3)
756 	#endif
757 	for(i=0; i<nr; i++)
758 	{
759 		js = A->bptr[i];
760 		je = A->bptr[i+1];
761 		t0 = 0.0;
762 		t1 = 0.0;
763 		t2 = 0.0;
764 		t3 = 0.0;
765 		for(j=js;j<je;j++)
766 		{
767 			jj  = A->bindex[j];
768 			t0 +=  A->value[j*4+ 0] * x[jj];
769 			t1 +=  A->value[j*4+ 1] * x[jj];
770 			t2 +=  A->value[j*4+ 2] * x[jj];
771 			t3 +=  A->value[j*4+ 3] * x[jj];
772 		}
773 		y[4*i+0] = t0;
774 		y[4*i+1] = t1;
775 		y[4*i+2] = t2;
776 		y[4*i+3] = t3;
777 	}
778 }
779 
lis_matvec_bsr_4x2(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])780 void lis_matvec_bsr_4x2(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
781 {
782 	LIS_INT i,j,js,je,jj;
783 	LIS_INT nr;
784 	LIS_SCALAR t0,t1,t2,t3;
785 
786 	nr   = A->nr;
787 
788 	#ifdef _OPENMP
789 	#pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2,t3)
790 	#endif
791 	for(i=0; i<nr; i++)
792 	{
793 		js = A->bptr[i];
794 		je = A->bptr[i+1];
795 		t0 = 0.0;
796 		t1 = 0.0;
797 		t2 = 0.0;
798 		t3 = 0.0;
799 		for(j=js;j<je;j++)
800 		{
801 			jj  = A->bindex[j];
802 			t0 +=  A->value[j*8+ 0] * x[jj*2+0];
803 			t1 +=  A->value[j*8+ 1] * x[jj*2+0];
804 			t2 +=  A->value[j*8+ 2] * x[jj*2+0];
805 			t3 +=  A->value[j*8+ 3] * x[jj*2+0];
806 			t0 +=  A->value[j*8+ 4] * x[jj*2+1];
807 			t1 +=  A->value[j*8+ 5] * x[jj*2+1];
808 			t2 +=  A->value[j*8+ 6] * x[jj*2+1];
809 			t3 +=  A->value[j*8+ 7] * x[jj*2+1];
810 		}
811 		y[4*i+0] = t0;
812 		y[4*i+1] = t1;
813 		y[4*i+2] = t2;
814 		y[4*i+3] = t3;
815 	}
816 }
817 
lis_matvec_bsr_4x3(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])818 void lis_matvec_bsr_4x3(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
819 {
820 	LIS_INT i,j,js,je,jj;
821 	LIS_INT nr;
822 	LIS_SCALAR t0,t1,t2,t3;
823 
824 	nr   = A->nr;
825 
826 	#ifdef _OPENMP
827 	#pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2,t3)
828 	#endif
829 	for(i=0; i<nr; i++)
830 	{
831 		js = A->bptr[i];
832 		je = A->bptr[i+1];
833 		t0 = 0.0;
834 		t1 = 0.0;
835 		t2 = 0.0;
836 		t3 = 0.0;
837 		for(j=js;j<je;j++)
838 		{
839 			jj  = A->bindex[j];
840 			t0 +=  A->value[j*12+ 0] * x[jj*3+0];
841 			t1 +=  A->value[j*12+ 1] * x[jj*3+0];
842 			t2 +=  A->value[j*12+ 2] * x[jj*3+0];
843 			t3 +=  A->value[j*12+ 3] * x[jj*3+0];
844 			t0 +=  A->value[j*12+ 4] * x[jj*3+1];
845 			t1 +=  A->value[j*12+ 5] * x[jj*3+1];
846 			t2 +=  A->value[j*12+ 6] * x[jj*3+1];
847 			t3 +=  A->value[j*12+ 7] * x[jj*3+1];
848 			t0 +=  A->value[j*12+ 8] * x[jj*3+2];
849 			t1 +=  A->value[j*12+ 9] * x[jj*3+2];
850 			t2 +=  A->value[j*12+10] * x[jj*3+2];
851 			t3 +=  A->value[j*12+11] * x[jj*3+2];
852 		}
853 		y[4*i+0] = t0;
854 		y[4*i+1] = t1;
855 		y[4*i+2] = t2;
856 		y[4*i+3] = t3;
857 	}
858 }
859 
lis_matvech_bsr(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])860 void lis_matvech_bsr(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
861 {
862 	LIS_INT i,j,k;
863 	LIS_INT bi,bj,bc,bs;
864 	LIS_INT nr,bnr,bnc;
865 	LIS_INT n,np;
866 	#ifdef _OPENMP
867 		LIS_INT nprocs,my_rank;
868 		LIS_SCALAR t;
869 		LIS_SCALAR *w;
870 	#endif
871 
872 	n   = A->n;
873 	np  = A->np;
874 	nr  = A->nr;
875 	bnr = A->bnr;
876 	bnc = A->bnc;
877 	bs  = bnr*bnc;
878 	if( A->is_splited )
879 	{
880 		for(i=0;i<n;i++)
881 		{
882 			y[i] = 0.0;
883 		}
884 		for(bi=0;bi<nr;bi++)
885 		{
886 			k    = bi*bs;
887 			for(j=0;j<bnc;j++)
888 			{
889 				for(i=0;i<bnr;i++)
890 				{
891 					y[bi*bnr+j] += conj(A->D->value[k++]) * x[bi*bnr+i];
892 				}
893 			}
894 		}
895 		for(bi=0;bi<nr;bi++)
896 		{
897 			for(bc=A->L->bptr[bi];bc<A->L->bptr[bi+1];bc++)
898 			{
899 				bj   = A->L->bindex[bc] * bnc;
900 				k    = bc*bs;
901 				for(j=0;j<bnc;j++)
902 				{
903 					for(i=0;i<bnr;i++)
904 					{
905 						y[bj+j] += conj(A->L->value[k]) * x[bi*bnr+i];
906 						k++;
907 					}
908 				}
909 			}
910 			for(bc=A->U->bptr[bi];bc<A->U->bptr[bi+1];bc++)
911 			{
912 				bj   = A->U->bindex[bc] * bnc;
913 				k    = bc*bs;
914 				for(j=0;j<bnc;j++)
915 				{
916 					for(i=0;i<bnr;i++)
917 					{
918 						y[bj+j] += conj(A->U->value[k]) * x[bi*bnr+i];
919 						k++;
920 					}
921 				}
922 			}
923 		}
924 	}
925 	else
926 	{
927 		#ifdef _OPENMP
928 			nprocs = omp_get_max_threads();
929 			w = (LIS_SCALAR *)lis_malloc( nprocs*np*sizeof(LIS_SCALAR),"lis_matvech_bsr::w" );
930 
931 			#pragma omp parallel private(bi,bc,bj,i,j,k,my_rank)
932 			{
933 				my_rank = omp_get_thread_num();
934 
935 				#pragma omp for
936 				for(j=0;j<nprocs;j++)
937 				{
938 					memset( &w[j*np], 0, np*sizeof(LIS_SCALAR) );
939 				}
940 				#pragma omp for
941 				for(bi=0;bi<nr;bi++)
942 				{
943 					for(bc=A->bptr[bi];bc<A->bptr[bi+1];bc++)
944 					{
945 						bj   = my_rank*np + A->bindex[bc] * bnc;
946 						k    = bc*bs;
947 						for(j=0;j<bnc;j++)
948 						{
949 							for(i=0;i<bnr;i++)
950 							{
951 								w[bj+j] += conj(A->value[k]) * x[bi*bnr+i];
952 								k++;
953 							}
954 						}
955 					}
956 				}
957 				#pragma omp barrier
958 				#pragma omp for
959 				for(i=0;i<np;i++)
960 				{
961 					t = 0.0;
962 					for(j=0;j<nprocs;j++)
963 					{
964 						t += w[j*np+i];
965 					}
966 					y[i] = t;
967 				}
968 			}
969 			lis_free(w);
970 		#else
971 			for(i=0; i<n; i++)
972 			{
973 				y[i] = 0.0;
974 			}
975 			for(bi=0;bi<nr;bi++)
976 			{
977 				for(bc=A->bptr[bi];bc<A->bptr[bi+1];bc++)
978 				{
979 					bj   = A->bindex[bc] * bnc;
980 					k    = bc*bs;
981 					for(j=0;j<bnc;j++)
982 					{
983 						for(i=0;i<bnr;i++)
984 						{
985 							y[bj+j] += conj(A->value[k]) * x[bi*bnr+i];
986 							k++;
987 						}
988 					}
989 				}
990 			}
991 		#endif
992 	}
993 }
994