1 /* Copyright (C) 2005 The Scalable Software Infrastructure Project. All rights reserved.
2 
3    Redistribution and use in source and binary forms, with or without
4    modification, are permitted provided that the following conditions are met:
5    1. Redistributions of source code must retain the above copyright
6       notice, this list of conditions and the following disclaimer.
7    2. Redistributions in binary form must reproduce the above copyright
8       notice, this list of conditions and the following disclaimer in the
9       documentation and/or other materials provided with the distribution.
10    3. Neither the name of the project nor the names of its contributors
11       may be used to endorse or promote products derived from this software
12       without specific prior written permission.
13 
14    THIS SOFTWARE IS PROVIDED BY THE SCALABLE SOFTWARE INFRASTRUCTURE PROJECT
15    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
16    TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17    PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE SCALABLE SOFTWARE INFRASTRUCTURE
18    PROJECT BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
19    OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
20    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
21    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
22    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
24    POSSIBILITY OF SUCH DAMAGE.
25 */
26 
27 #ifdef HAVE_CONFIG_H
28 	#include "lis_config.h"
29 #else
30 #ifdef HAVE_CONFIG_WIN_H
31 	#include "lis_config_win.h"
32 #endif
33 #endif
34 
35 #include <stdio.h>
36 #include <stdlib.h>
37 #ifdef HAVE_MALLOC_H
38         #include <malloc.h>
39 #endif
40 #include <string.h>
41 #include <stdarg.h>
42 #ifdef _OPENMP
43 	#include <omp.h>
44 #endif
45 #ifdef USE_MPI
46 	#include <mpi.h>
47 #endif
48 #include "lislib.h"
49 
50 LIS_MATVEC_XXX lis_matvec_bsc_xxx[4][4] = {
51 	{lis_matvec_bsc_1x1, lis_matvec_bsc_1x2, lis_matvec_bsc_1x3, lis_matvec_bsc_1x4},
52 	{lis_matvec_bsc_2x1, lis_matvec_bsc_2x2, lis_matvec_bsc_2x3, lis_matvec_bsc_2x4},
53 	{lis_matvec_bsc_3x1, lis_matvec_bsc_3x2, lis_matvec_bsc_3x3, lis_matvec_bsc_3x4},
54 	{lis_matvec_bsc_4x1, lis_matvec_bsc_4x2, lis_matvec_bsc_4x3, lis_matvec_bsc_4x4}
55 };
56 
lis_matvec_bsc(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])57 void lis_matvec_bsc(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
58 {
59 	LIS_INT i,j,k;
60 	LIS_INT bi,bj,bc,bs;
61 	LIS_INT nr,nc,bnr,bnc;
62 	LIS_INT n;
63 	LIS_SCALAR t;
64 
65 	n   = A->n;
66 	nr  = A->nr;
67 	nc  = A->nc;
68 	bnr = A->bnr;
69 	bnc = A->bnc;
70 	bs  = bnr*bnc;
71 	if( A->is_splited )
72 	{
73 		#ifdef _OPENMP
74 		#pragma omp parallel for private(bi,i,j,t,k)
75 		#endif
76 		for(bi=0;bi<nr;bi++)
77 		{
78 			k = 0;
79 			for(i=0;i<bnr;i++)
80 			{
81 				t = 0.0;
82 				for(j=0;j<bnc;j++)
83 				{
84 					t += A->D->value[bi*bs+ j*bnr + i] * x[bi*bnr+j];
85 					k++;
86 				}
87 				y[bi*bnr+i] = t;
88 			}
89 		}
90 		#ifdef _OPENMP
91 		#pragma omp parallel for private(i,j,k,bi,bj,bc)
92 		#endif
93 		for(bi=0;bi<nc;bi++)
94 		{
95 			for(bc=A->L->bptr[bi];bc<A->L->bptr[bi+1];bc++)
96 			{
97 				bj   = A->L->bindex[bc] * bnr;
98 				k    = bc*bs;
99 				for(j=0;j<bnc;j++)
100 				{
101 					for(i=0;i<bnr;i++)
102 					{
103 						y[bj+i] += A->L->value[k] * x[bi*bnc+j];
104 						k++;
105 					}
106 				}
107 			}
108 			for(bc=A->U->bptr[bi];bc<A->U->bptr[bi+1];bc++)
109 			{
110 				bj   = A->U->bindex[bc] * bnr;
111 				k    = bc*bs;
112 				for(j=0;j<bnc;j++)
113 				{
114 					for(i=0;i<bnr;i++)
115 					{
116 						y[bj+i] += A->U->value[k] * x[bi*bnc+j];
117 						k++;
118 					}
119 				}
120 			}
121 		}
122 	}
123 	else
124 	{
125 		#ifdef _OPENMP
126 		#pragma omp parallel for private(i)
127 		#endif
128 		for(i=0; i<n; i++)
129 		{
130 			y[i] = 0.0;
131 		}
132 		#ifdef _OPENMP
133 		#pragma omp parallel for private(i,j,k,bi,bj,bc)
134 		#endif
135 		for(bi=0;bi<nc;bi++)
136 		{
137 			for(bc=A->bptr[bi];bc<A->bptr[bi+1];bc++)
138 			{
139 				bj   = A->bindex[bc] * bnr;
140 				k    = bc*bs;
141 				for(j=0;j<bnc;j++)
142 				{
143 					for(i=0;i<bnr;i++)
144 					{
145 						y[bj+i] += A->value[k] * x[bi*bnc+j];
146 						k++;
147 					}
148 				}
149 			}
150 		}
151 	}
152 }
153 
lis_matvec_bsc_1x1(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])154 void lis_matvec_bsc_1x1(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
155 {
156 	LIS_INT i,j,js,je,jj;
157 	LIS_INT nr;
158 	LIS_SCALAR t0;
159 
160 	nr   = A->nr;
161 	#ifdef _OPENMP
162 	#pragma omp parallel for private(i,j,jj,js,je,t0)
163 	#endif
164 	for(i=0; i<nr; i++)
165 	{
166 		js = A->bptr[i];
167 		je = A->bptr[i+1];
168 		t0 = 0.0;
169 		for(j=js;j<je;j++)
170 		{
171 			jj  = A->bindex[j];
172 			t0 += A->value[j] * x[jj];
173 		}
174 		y[i] = t0;
175 	}
176 }
177 
lis_matvec_bsc_1x2(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])178 void lis_matvec_bsc_1x2(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
179 {
180 	LIS_INT i,j,js,je,jj;
181 	LIS_INT nr;
182 	LIS_SCALAR t0;
183 
184 	nr   = A->nr;
185 
186 	#ifdef _OPENMP
187 	#pragma omp parallel for private(i,j,jj,js,je,t0)
188 	#endif
189 	for(i=0; i<nr; i++)
190 	{
191 		js = A->bptr[i];
192 		je = A->bptr[i+1];
193 		t0 = 0.0;
194 		for(j=js;j<je;j++)
195 		{
196 			jj  = A->bindex[j];
197 			t0 += A->value[j*2+0] * x[jj*2+0];
198 			t0 += A->value[j*2+1] * x[jj*2+1];
199 		}
200 		y[i] = t0;
201 	}
202 }
203 
lis_matvec_bsc_1x3(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])204 void lis_matvec_bsc_1x3(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
205 {
206 	LIS_INT i,j,js,je,jj;
207 	LIS_INT nr;
208 	LIS_SCALAR t0;
209 
210 	nr   = A->nr;
211 
212 	#ifdef _OPENMP
213 	#pragma omp parallel for private(i,j,jj,js,je,t0)
214 	#endif
215 	for(i=0; i<nr; i++)
216 	{
217 		js = A->bptr[i];
218 		je = A->bptr[i+1];
219 		t0 = 0.0;
220 		for(j=js;j<je;j++)
221 		{
222 			jj  = A->bindex[j];
223 			t0 += A->value[j*3+0] * x[jj*3+0];
224 			t0 += A->value[j*3+1] * x[jj*3+1];
225 			t0 += A->value[j*3+2] * x[jj*3+2];
226 		}
227 		y[i] = t0;
228 	}
229 }
230 
lis_matvec_bsc_1x4(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])231 void lis_matvec_bsc_1x4(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
232 {
233 	LIS_INT i,j,js,je,jj;
234 	LIS_INT nr;
235 	LIS_SCALAR t0;
236 
237 	nr   = A->nr;
238 
239 	#ifdef _OPENMP
240 	#pragma omp parallel for private(i,j,jj,js,je,t0)
241 	#endif
242 	for(i=0; i<nr; i++)
243 	{
244 		js = A->bptr[i];
245 		je = A->bptr[i+1];
246 		t0 = 0.0;
247 		for(j=js;j<je;j++)
248 		{
249 			jj  = A->bindex[j];
250 			t0 += A->value[j*4+0] * x[jj*4+0];
251 			t0 += A->value[j*4+1] * x[jj*4+1];
252 			t0 += A->value[j*4+2] * x[jj*4+2];
253 			t0 += A->value[j*4+3] * x[jj*4+3];
254 		}
255 		y[i] = t0;
256 	}
257 }
258 
lis_matvec_bsc_2x2(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])259 void lis_matvec_bsc_2x2(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
260 {
261 	LIS_INT i,j,js,je,jj;
262 	LIS_INT nr;
263 	LIS_SCALAR t0,t1;
264 
265 	nr   = A->nr;
266 
267 	if( A->is_splited )
268 	{
269 		#ifdef _OPENMP
270 		#pragma omp parallel for private(i,j,jj,js,je,t0,t1)
271 		#endif
272 		for(i=0; i<nr; i++)
273 		{
274 			t0 = A->D->value[4*i+0] * x[2*i+0] + A->D->value[4*i+2] * x[2*i+1];
275 			t1 = A->D->value[4*i+1] * x[2*i+0] + A->D->value[4*i+3] * x[2*i+1];
276 			js = A->L->bptr[i];
277 			je = A->L->bptr[i+1];
278 			for(j=js;j<je;j++)
279 			{
280 				jj  = A->L->bindex[j];
281 				t0 += A->L->value[j*4+0] * x[jj*2+0];
282 				t1 += A->L->value[j*4+1] * x[jj*2+0];
283 				t0 += A->L->value[j*4+2] * x[jj*2+1];
284 				t1 += A->L->value[j*4+3] * x[jj*2+1];
285 			}
286 			js = A->U->bptr[i];
287 			je = A->U->bptr[i+1];
288 			for(j=js;j<je;j++)
289 			{
290 				jj  = A->U->bindex[j];
291 				t0 += A->U->value[j*4+0] * x[jj*2+0];
292 				t1 += A->U->value[j*4+1] * x[jj*2+0];
293 				t0 += A->U->value[j*4+2] * x[jj*2+1];
294 				t1 += A->U->value[j*4+3] * x[jj*2+1];
295 			}
296 			y[2*i+0] = t0;
297 			y[2*i+1] = t1;
298 		}
299 	}
300 	else
301 	{
302 		#ifdef _OPENMP
303 		#pragma omp parallel for private(i,j,jj,js,je,t0,t1)
304 		#endif
305 		for(i=0; i<nr; i++)
306 		{
307 			js = A->bptr[i];
308 			je = A->bptr[i+1];
309 			t0 = 0.0;
310 			t1 = 0.0;
311 			for(j=js;j<je;j++)
312 			{
313 				jj  = A->bindex[j];
314 				t0 += A->value[j*4+0] * x[jj*2+0];
315 				t1 += A->value[j*4+1] * x[jj*2+0];
316 				t0 += A->value[j*4+2] * x[jj*2+1];
317 				t1 += A->value[j*4+3] * x[jj*2+1];
318 			}
319 			y[2*i+0] = t0;
320 			y[2*i+1] = t1;
321 		}
322 	}
323 }
324 
lis_matvec_bsc_2x1(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])325 void lis_matvec_bsc_2x1(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
326 {
327 	LIS_INT i,j,js,je,jj;
328 	LIS_INT nr;
329 	LIS_SCALAR t0,t1;
330 
331 	nr   = A->nr;
332 
333 	#ifdef _OPENMP
334 	#pragma omp parallel for private(i,j,jj,js,je,t0,t1)
335 	#endif
336 	for(i=0; i<nr; i++)
337 	{
338 		js = A->bptr[i];
339 		je = A->bptr[i+1];
340 		t0 = 0.0;
341 		t1 = 0.0;
342 		for(j=js;j<je;j++)
343 		{
344 			jj  = A->bindex[j];
345 			t0 += A->value[j*2+0] * x[jj];
346 			t1 += A->value[j*2+1] * x[jj];
347 		}
348 		y[2*i+0] = t0;
349 		y[2*i+1] = t1;
350 	}
351 }
352 
lis_matvec_bsc_2x3(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])353 void lis_matvec_bsc_2x3(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
354 {
355 	LIS_INT i,j,js,je,jj;
356 	LIS_INT nr;
357 	LIS_SCALAR t0,t1;
358 
359 	nr   = A->nr;
360 
361 	#ifdef _OPENMP
362 	#pragma omp parallel for private(i,j,jj,js,je,t0,t1)
363 	#endif
364 	for(i=0; i<nr; i++)
365 	{
366 		js = A->bptr[i];
367 		je = A->bptr[i+1];
368 		t0 = 0.0;
369 		t1 = 0.0;
370 		for(j=js;j<je;j++)
371 		{
372 			jj  = A->bindex[j];
373 			t0 += A->value[j*6+0] * x[jj*3+0];
374 			t1 += A->value[j*6+1] * x[jj*3+0];
375 			t0 += A->value[j*6+2] * x[jj*3+1];
376 			t1 += A->value[j*6+3] * x[jj*3+1];
377 			t0 += A->value[j*6+4] * x[jj*3+2];
378 			t1 += A->value[j*6+5] * x[jj*3+2];
379 		}
380 		y[2*i+0] = t0;
381 		y[2*i+1] = t1;
382 	}
383 }
384 
lis_matvec_bsc_2x4(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])385 void lis_matvec_bsc_2x4(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
386 {
387 	LIS_INT i,j,js,je,jj;
388 	LIS_INT nr;
389 	LIS_SCALAR t0,t1;
390 
391 	nr   = A->nr;
392 
393 	#ifdef _OPENMP
394 	#pragma omp parallel for private(i,j,jj,js,je,t0,t1)
395 	#endif
396 	for(i=0; i<nr; i++)
397 	{
398 		js = A->bptr[i];
399 		je = A->bptr[i+1];
400 		t0 = 0.0;
401 		t1 = 0.0;
402 		for(j=js;j<je;j++)
403 		{
404 			jj  = A->bindex[j];
405 			t0 +=  A->value[j*8+0] * x[jj*4+0];
406 			t1 +=  A->value[j*8+1] * x[jj*4+0];
407 			t0 +=  A->value[j*8+2] * x[jj*4+1];
408 			t1 +=  A->value[j*8+3] * x[jj*4+1];
409 			t0 +=  A->value[j*8+4] * x[jj*4+2];
410 			t1 +=  A->value[j*8+5] * x[jj*4+2];
411 			t0 +=  A->value[j*8+6] * x[jj*4+3];
412 			t1 +=  A->value[j*8+7] * x[jj*4+3];
413 		}
414 		y[2*i+0] = t0;
415 		y[2*i+1] = t1;
416 	}
417 }
418 
lis_matvec_bsc_3x3(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])419 void lis_matvec_bsc_3x3(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
420 {
421 	LIS_INT i,j,js,je,jj;
422 	LIS_INT nr;
423 	LIS_SCALAR t0,t1,t2;
424 
425 	nr   = A->nr;
426 
427 	#ifdef _OPENMP
428 	#pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2)
429 	#endif
430 	for(i=0; i<nr; i++)
431 	{
432 		js = A->bptr[i];
433 		je = A->bptr[i+1];
434 		t0 = 0.0;
435 		t1 = 0.0;
436 		t2 = 0.0;
437 		for(j=js;j<je;j++)
438 		{
439 			jj  = A->bindex[j];
440 			t0 +=  A->value[j*9+0] * x[jj*3+0];
441 			t1 +=  A->value[j*9+1] * x[jj*3+0];
442 			t2 +=  A->value[j*9+2] * x[jj*3+0];
443 			t0 +=  A->value[j*9+3] * x[jj*3+1];
444 			t1 +=  A->value[j*9+4] * x[jj*3+1];
445 			t2 +=  A->value[j*9+5] * x[jj*3+1];
446 			t0 +=  A->value[j*9+6] * x[jj*3+2];
447 			t1 +=  A->value[j*9+7] * x[jj*3+2];
448 			t2 +=  A->value[j*9+8] * x[jj*3+2];
449 		}
450 		y[3*i+0] = t0;
451 		y[3*i+1] = t1;
452 		y[3*i+2] = t2;
453 	}
454 }
455 
lis_matvec_bsc_3x1(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])456 void lis_matvec_bsc_3x1(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
457 {
458 	LIS_INT i,j,js,je,jj,ii;
459 	LIS_INT nr;
460 	LIS_SCALAR t0,t1,t2;
461 
462 	nr   = A->nr;
463 
464 	#ifdef _OPENMP
465 	#pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2,ii)
466 	#endif
467 	for(i=0; i<nr; i++)
468 	{
469 		ii = 3*i;
470 		js = A->bptr[i];
471 		je = A->bptr[i+1];
472 		t0 = 0.0;
473 		t1 = 0.0;
474 		t2 = 0.0;
475 		for(j=js;j<je;j++)
476 		{
477 			jj  = A->bindex[j];
478 			t0 +=  A->value[j*3+0] * x[jj];
479 			t1 +=  A->value[j*3+1] * x[jj];
480 			t2 +=  A->value[j*3+2] * x[jj];
481 		}
482 		y[ii+0] = t0;
483 		y[ii+1] = t1;
484 		y[ii+2] = t2;
485 	}
486 }
487 
lis_matvec_bsc_3x2(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])488 void lis_matvec_bsc_3x2(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
489 {
490 	LIS_INT i,j,js,je,jj;
491 	LIS_INT nr;
492 	LIS_SCALAR t0,t1,t2;
493 
494 	nr   = A->nr;
495 
496 	#ifdef _OPENMP
497 	#pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2)
498 	#endif
499 	for(i=0; i<nr; i++)
500 	{
501 		js = A->bptr[i];
502 		je = A->bptr[i+1];
503 		t0 = 0.0;
504 		t1 = 0.0;
505 		t2 = 0.0;
506 		for(j=js;j<je;j++)
507 		{
508 			jj  = A->bindex[j];
509 			t0 +=  A->value[j*6+0] * x[jj*2+0];
510 			t1 +=  A->value[j*6+1] * x[jj*2+0];
511 			t2 +=  A->value[j*6+2] * x[jj*2+0];
512 			t0 +=  A->value[j*6+3] * x[jj*2+1];
513 			t1 +=  A->value[j*6+4] * x[jj*2+1];
514 			t2 +=  A->value[j*6+5] * x[jj*2+1];
515 		}
516 		y[3*i+0] = t0;
517 		y[3*i+1] = t1;
518 		y[3*i+2] = t2;
519 	}
520 }
521 
lis_matvec_bsc_3x4(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])522 void lis_matvec_bsc_3x4(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
523 {
524 	LIS_INT i,j,js,je,jj;
525 	LIS_INT nr;
526 	LIS_SCALAR t0,t1,t2;
527 
528 	nr   = A->nr;
529 
530 	#ifdef _OPENMP
531 	#pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2)
532 	#endif
533 	for(i=0; i<nr; i++)
534 	{
535 		js = A->bptr[i];
536 		je = A->bptr[i+1];
537 		t0 = 0.0;
538 		t1 = 0.0;
539 		t2 = 0.0;
540 		for(j=js;j<je;j++)
541 		{
542 			jj  = A->bindex[j];
543 			t0 +=  A->value[j*12+ 0] * x[jj*4+0];
544 			t1 +=  A->value[j*12+ 1] * x[jj*4+0];
545 			t2 +=  A->value[j*12+ 2] * x[jj*4+0];
546 			t0 +=  A->value[j*12+ 3] * x[jj*4+1];
547 			t1 +=  A->value[j*12+ 4] * x[jj*4+1];
548 			t2 +=  A->value[j*12+ 5] * x[jj*4+1];
549 			t0 +=  A->value[j*12+ 6] * x[jj*4+2];
550 			t1 +=  A->value[j*12+ 7] * x[jj*4+2];
551 			t2 +=  A->value[j*12+ 8] * x[jj*4+2];
552 			t0 +=  A->value[j*12+ 9] * x[jj*4+3];
553 			t1 +=  A->value[j*12+10] * x[jj*4+3];
554 			t2 +=  A->value[j*12+11] * x[jj*4+3];
555 		}
556 		y[3*i+0] = t0;
557 		y[3*i+1] = t1;
558 		y[3*i+2] = t2;
559 	}
560 }
561 
lis_matvec_bsc_4x4(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])562 void lis_matvec_bsc_4x4(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
563 {
564 	LIS_INT i,j,js,je,jj;
565 	LIS_INT nr;
566 	LIS_SCALAR t0,t1,t2,t3;
567 
568 	nr   = A->nr;
569 
570 	#ifdef _OPENMP
571 	#pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2,t3)
572 	#endif
573 	for(i=0; i<nr; i++)
574 	{
575 		js = A->bptr[i];
576 		je = A->bptr[i+1];
577 		t0 = 0.0;
578 		t1 = 0.0;
579 		t2 = 0.0;
580 		t3 = 0.0;
581 		for(j=js;j<je;j++)
582 		{
583 			jj  = A->bindex[j];
584 			t0 +=  A->value[j*16+ 0] * x[jj*4+0];
585 			t1 +=  A->value[j*16+ 1] * x[jj*4+0];
586 			t2 +=  A->value[j*16+ 2] * x[jj*4+0];
587 			t3 +=  A->value[j*16+ 3] * x[jj*4+0];
588 			t0 +=  A->value[j*16+ 4] * x[jj*4+1];
589 			t1 +=  A->value[j*16+ 5] * x[jj*4+1];
590 			t2 +=  A->value[j*16+ 6] * x[jj*4+1];
591 			t3 +=  A->value[j*16+ 7] * x[jj*4+1];
592 			t0 +=  A->value[j*16+ 8] * x[jj*4+2];
593 			t1 +=  A->value[j*16+ 9] * x[jj*4+2];
594 			t2 +=  A->value[j*16+10] * x[jj*4+2];
595 			t3 +=  A->value[j*16+11] * x[jj*4+2];
596 			t0 +=  A->value[j*16+12] * x[jj*4+3];
597 			t1 +=  A->value[j*16+13] * x[jj*4+3];
598 			t2 +=  A->value[j*16+14] * x[jj*4+3];
599 			t3 +=  A->value[j*16+15] * x[jj*4+3];
600 		}
601 		y[4*i+0] = t0;
602 		y[4*i+1] = t1;
603 		y[4*i+2] = t2;
604 		y[4*i+3] = t3;
605 	}
606 }
607 
lis_matvec_bsc_4x1(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])608 void lis_matvec_bsc_4x1(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
609 {
610 	LIS_INT i,j,js,je,jj;
611 	LIS_INT nr;
612 	LIS_SCALAR t0,t1,t2,t3;
613 
614 	nr   = A->nr;
615 
616 	#ifdef _OPENMP
617 	#pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2,t3)
618 	#endif
619 	for(i=0; i<nr; i++)
620 	{
621 		js = A->bptr[i];
622 		je = A->bptr[i+1];
623 		t0 = 0.0;
624 		t1 = 0.0;
625 		t2 = 0.0;
626 		t3 = 0.0;
627 		for(j=js;j<je;j++)
628 		{
629 			jj  = A->bindex[j];
630 			t0 +=  A->value[j*4+ 0] * x[jj];
631 			t1 +=  A->value[j*4+ 1] * x[jj];
632 			t2 +=  A->value[j*4+ 2] * x[jj];
633 			t3 +=  A->value[j*4+ 3] * x[jj];
634 		}
635 		y[4*i+0] = t0;
636 		y[4*i+1] = t1;
637 		y[4*i+2] = t2;
638 		y[4*i+3] = t3;
639 	}
640 }
641 
lis_matvec_bsc_4x2(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])642 void lis_matvec_bsc_4x2(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
643 {
644 	LIS_INT i,j,js,je,jj;
645 	LIS_INT nr;
646 	LIS_SCALAR t0,t1,t2,t3;
647 
648 	nr   = A->nr;
649 
650 	#ifdef _OPENMP
651 	#pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2,t3)
652 	#endif
653 	for(i=0; i<nr; i++)
654 	{
655 		js = A->bptr[i];
656 		je = A->bptr[i+1];
657 		t0 = 0.0;
658 		t1 = 0.0;
659 		t2 = 0.0;
660 		t3 = 0.0;
661 		for(j=js;j<je;j++)
662 		{
663 			jj  = A->bindex[j];
664 			t0 +=  A->value[j*8+ 0] * x[jj*2+0];
665 			t1 +=  A->value[j*8+ 1] * x[jj*2+0];
666 			t2 +=  A->value[j*8+ 2] * x[jj*2+0];
667 			t3 +=  A->value[j*8+ 3] * x[jj*2+0];
668 			t0 +=  A->value[j*8+ 4] * x[jj*2+1];
669 			t1 +=  A->value[j*8+ 5] * x[jj*2+1];
670 			t2 +=  A->value[j*8+ 6] * x[jj*2+1];
671 			t3 +=  A->value[j*8+ 7] * x[jj*2+1];
672 		}
673 		y[4*i+0] = t0;
674 		y[4*i+1] = t1;
675 		y[4*i+2] = t2;
676 		y[4*i+3] = t3;
677 	}
678 }
679 
lis_matvec_bsc_4x3(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])680 void lis_matvec_bsc_4x3(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
681 {
682 	LIS_INT i,j,js,je,jj;
683 	LIS_INT nr;
684 	LIS_SCALAR t0,t1,t2,t3;
685 
686 	nr   = A->nr;
687 
688 	#ifdef _OPENMP
689 	#pragma omp parallel for private(i,j,jj,js,je,t0,t1,t2,t3)
690 	#endif
691 	for(i=0; i<nr; i++)
692 	{
693 		js = A->bptr[i];
694 		je = A->bptr[i+1];
695 		t0 = 0.0;
696 		t1 = 0.0;
697 		t2 = 0.0;
698 		t3 = 0.0;
699 		for(j=js;j<je;j++)
700 		{
701 			jj  = A->bindex[j];
702 			t0 +=  A->value[j*12+ 0] * x[jj*3+0];
703 			t1 +=  A->value[j*12+ 1] * x[jj*3+0];
704 			t2 +=  A->value[j*12+ 2] * x[jj*3+0];
705 			t3 +=  A->value[j*12+ 3] * x[jj*3+0];
706 			t0 +=  A->value[j*12+ 4] * x[jj*3+1];
707 			t1 +=  A->value[j*12+ 5] * x[jj*3+1];
708 			t2 +=  A->value[j*12+ 6] * x[jj*3+1];
709 			t3 +=  A->value[j*12+ 7] * x[jj*3+1];
710 			t0 +=  A->value[j*12+ 8] * x[jj*3+2];
711 			t1 +=  A->value[j*12+ 9] * x[jj*3+2];
712 			t2 +=  A->value[j*12+10] * x[jj*3+2];
713 			t3 +=  A->value[j*12+11] * x[jj*3+2];
714 		}
715 		y[4*i+0] = t0;
716 		y[4*i+1] = t1;
717 		y[4*i+2] = t2;
718 		y[4*i+3] = t3;
719 	}
720 }
721 
lis_matvech_bsc(LIS_MATRIX A,LIS_SCALAR x[],LIS_SCALAR y[])722 void lis_matvech_bsc(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
723 {
724 	LIS_INT i,j,k;
725 	LIS_INT bi,bj,bc,bs;
726 	LIS_INT nr,nc,bnr,bnc;
727 	LIS_INT n,np;
728 	#ifdef _OPENMP
729 		LIS_INT nprocs;
730 		LIS_SCALAR t;
731 		LIS_SCALAR *w;
732 	#endif
733 
734 	n   = A->n;
735 	np  = A->np;
736 	nr  = A->nr;
737 	nc  = A->nc;
738 	bnr = A->bnr;
739 	bnc = A->bnc;
740 	bs  = bnr*bnc;
741 	if( A->is_splited )
742 	{
743 		for(i=0;i<n;i++)
744 		{
745 			y[i] = 0.0;
746 		}
747 		for(bi=0;bi<nr;bi++)
748 		{
749 			k    = bi*bs;
750 			for(j=0;j<bnc;j++)
751 			{
752 				for(i=0;i<bnr;i++)
753 				{
754 					y[bi*bnr+j] += conj(A->D->value[k++]) * x[bi*bnr+i];
755 				}
756 			}
757 		}
758 		for(bi=0;bi<nc;bi++)
759 		{
760 			for(bc=A->L->bptr[bi];bc<A->L->bptr[bi+1];bc++)
761 			{
762 				bj   = A->L->bindex[bc] * bnr;
763 				k    = bc*bs;
764 				for(j=0;j<bnc;j++)
765 				{
766 					for(i=0;i<bnr;i++)
767 					{
768 						y[bj+j] += conj(A->L->value[k]) * x[bi*bnr+i];
769 						k++;
770 					}
771 				}
772 			}
773 			for(bc=A->U->bptr[bi];bc<A->U->bptr[bi+1];bc++)
774 			{
775 				bj   = A->U->bindex[bc] * bnr;
776 				k    = bc*bs;
777 				for(j=0;j<bnc;j++)
778 				{
779 					for(i=0;i<bnr;i++)
780 					{
781 						y[bj+j] += conj(A->U->value[k]) * x[bi*bnr+i];
782 						k++;
783 					}
784 				}
785 			}
786 		}
787 	}
788 	else
789 	{
790 		#ifdef _OPENMP
791 			nprocs = omp_get_max_threads();
792 			w = (LIS_SCALAR *)lis_malloc( nprocs*np*sizeof(LIS_SCALAR),"lis_matvech_bsc::w" );
793 
794 			#pragma omp parallel private(bi,bc,bj,i,j,k)
795 			{
796 				#pragma omp for
797 				for(j=0;j<nprocs;j++)
798 				{
799 					memset( &w[j*np], 0, np*sizeof(LIS_SCALAR) );
800 				}
801 				#pragma omp for
802 				for(bi=0;bi<nc;bi++)
803 				{
804 					for(bc=A->bptr[bi];bc<A->bptr[bi+1];bc++)
805 					{
806 						bj   = A->bindex[bc] * bnr;
807 						k    = bc*bs;
808 						for(j=0;j<bnc;j++)
809 						{
810 							for(i=0;i<bnr;i++)
811 							{
812 								w[bi*bnc+j] += conj(A->value[k]) * x[bj+i];
813 								k++;
814 							}
815 						}
816 					}
817 				}
818 				#pragma omp barrier
819 				#pragma omp for
820 				for(i=0;i<np;i++)
821 				{
822 					t = 0.0;
823 					for(j=0;j<nprocs;j++)
824 					{
825 						t += w[j*np+i];
826 					}
827 					y[i] = t;
828 				}
829 			}
830 			lis_free(w);
831 		#else
832 			for(i=0; i<n; i++)
833 			{
834 				y[i] = 0.0;
835 			}
836 			for(bi=0;bi<nc;bi++)
837 			{
838 				for(bc=A->bptr[bi];bc<A->bptr[bi+1];bc++)
839 				{
840 					bj   = A->bindex[bc] * bnr;
841 					k    = bc*bs;
842 					for(j=0;j<bnc;j++)
843 					{
844 						for(i=0;i<bnr;i++)
845 						{
846 							y[bi*bnc+j] += conj(A->value[k]) * x[bj+i];
847 							k++;
848 						}
849 					}
850 				}
851 			}
852 		#endif
853 	}
854 }
855