1 /* Copyright (C) 2005 The Scalable Software Infrastructure Project. All rights reserved.
2 
3    Redistribution and use in source and binary forms, with or without
4    modification, are permitted provided that the following conditions are met:
5    1. Redistributions of source code must retain the above copyright
6       notice, this list of conditions and the following disclaimer.
7    2. Redistributions in binary form must reproduce the above copyright
8       notice, this list of conditions and the following disclaimer in the
9       documentation and/or other materials provided with the distribution.
10    3. Neither the name of the project nor the names of its contributors
11       may be used to endorse or promote products derived from this software
12       without specific prior written permission.
13 
14    THIS SOFTWARE IS PROVIDED BY THE SCALABLE SOFTWARE INFRASTRUCTURE PROJECT
15    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
16    TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17    PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE SCALABLE SOFTWARE INFRASTRUCTURE
18    PROJECT BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
19    OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
20    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
21    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
22    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
24    POSSIBILITY OF SUCH DAMAGE.
25 */
26 
27 /*
28  * This subroutine is made based on ITSOL.
29  *
30  * http://www-users.cs.umn.edu/~saad/software/ITSOL/
31  *
32  */
33 
34 #ifdef HAVE_CONFIG_H
35 	#include "lis_config.h"
36 #else
37 #ifdef HAVE_CONFIG_WIN_H
38 	#include "lis_config_win.h"
39 #endif
40 #endif
41 
42 #include <stdio.h>
43 #include <stdlib.h>
44 #ifdef HAVE_MALLOC_H
45         #include <malloc.h>
46 #endif
47 #include <string.h>
48 #include <stdarg.h>
49 #include <math.h>
50 #ifdef _OPENMP
51 	#include <omp.h>
52 #endif
53 #ifdef USE_MPI
54 	#include <mpi.h>
55 #endif
56 #include "lislib.h"
57 
58 /************************************************
59  * lis_precon_create
60  * lis_psolve
61  * lis_psolveh
62  ************************************************/
63 
64 #define BSR_ENABLE 0
65 #undef __FUNC__
66 #define __FUNC__ "lis_precon_create_iluc"
lis_precon_create_iluc(LIS_SOLVER solver,LIS_PRECON precon)67 LIS_INT lis_precon_create_iluc(LIS_SOLVER solver, LIS_PRECON precon)
68 {
69 	LIS_INT	storage,block,err;
70 	LIS_MATRIX A,B;
71 
72 	LIS_DEBUG_FUNC_IN;
73 
74 #ifdef BSR_ENABLE
75 	storage     = solver->options[LIS_OPTIONS_STORAGE];
76 	block       = solver->options[LIS_OPTIONS_STORAGE_BLOCK];
77 
78 	if( storage==LIS_MATRIX_BSR )
79 	{
80 		if( solver->A->matrix_type!=storage )
81 		{
82 			err = lis_matrix_convert_self(solver);
83 			if( err ) return err;
84 		}
85 	}
86 #endif
87 
88 	switch( solver->A->matrix_type )
89 	{
90 	case LIS_MATRIX_CSR:
91 		err = lis_matrix_split(solver->A);
92 		if( err )
93 		{
94 			return err;
95 		}
96 		err = lis_precon_create_iluc_csr(solver,precon);
97 		break;
98 #ifdef BSR_ENABLE
99 	case LIS_MATRIX_BSR:
100 		err = lis_matrix_split(solver->A);
101 		if( err )
102 		{
103 			return err;
104 		}
105 		err = lis_precon_create_iluc_bsr(solver,precon);
106 		lis_psolve_xxx[LIS_PRECON_TYPE_ILUC]  = lis_psolve_iluc_bsr;
107 		break;
108 #endif
109 	default:
110 		A = solver->A;
111 		err = lis_matrix_duplicate(A,&B);
112 		if( err ) return err;
113 		lis_matrix_set_type(B,LIS_MATRIX_CSR);
114 		err = lis_matrix_convert(A,B);
115 		if( err ) return err;
116 		solver->A = B;
117 		err = lis_matrix_split(solver->A);
118 		if( err )
119 		{
120 			return err;
121 		}
122 		err = lis_precon_create_iluc_csr(solver,precon);
123 		lis_matrix_destroy(B);
124 		solver->A = A;
125 		break;
126 	}
127 
128 	LIS_DEBUG_FUNC_OUT;
129     return LIS_SUCCESS;
130 }
131 
132 
133 #undef __FUNC__
134 #define __FUNC__ "lis_precon_create_iluc_csr"
lis_precon_create_iluc_csr(LIS_SOLVER solver,LIS_PRECON precon)135 LIS_INT lis_precon_create_iluc_csr(LIS_SOLVER solver, LIS_PRECON precon)
136 {
137 #ifdef _OPENMP
138 	LIS_INT	err;
139 	LIS_INT	i,j,k,l,ii,jj,kk,ll;
140 	LIS_INT	is,ie,my_rank,nprocs;
141 	LIS_INT	n,nnz,lfil,len;
142 	LIS_INT	cz,cw;
143 	LIS_INT	*iw,*iw2,*wc,*wl,*iz,*zc,*zl;
144 	LIS_INT	*index,*ptr;
145 	LIS_SCALAR val;
146 	LIS_REAL t,tol,toldd;
147 	LIS_SCALAR *z,*w,*tmp;
148 	LIS_SCALAR *value;
149 	LIS_SCALAR m;
150 	LIS_MATRIX A;
151 	LIS_MATRIX_ILU L,U;
152 	LIS_VECTOR D;
153 
154 	LIS_DEBUG_FUNC_IN;
155 
156 
157 	A      = solver->A;
158 	n      = A->n;
159 	tol    = solver->params[LIS_PARAMS_DROP-LIS_OPTIONS_LEN];
160 	m      = solver->params[LIS_PARAMS_RATE-LIS_OPTIONS_LEN];
161 	lfil   = (LIS_INT)((double)A->nnz/(2.0*n))*m;
162 	nprocs = omp_get_max_threads();
163 
164 	L      = NULL;
165 	U      = NULL;
166 
167 
168 	err = lis_matrix_ilu_create(n,1,&L);
169 	if( err ) return err;
170 	err = lis_matrix_ilu_create(n,1,&U);
171 	if( err ) return err;
172 	err = lis_matrix_ilu_setCR(L);
173 	if( err ) return err;
174 	err = lis_matrix_ilu_setCR(U);
175 	if( err ) return err;
176 	err = lis_vector_duplicate(A,&D);
177 	if( err )
178 	{
179 		return err;
180 	}
181 
182 	tmp   = (LIS_SCALAR *)lis_malloc(nprocs*n*sizeof(LIS_SCALAR),"lis_precon_create_iluc_csr::tmp");
183 	if( tmp==NULL )
184 	{
185 		LIS_SETERR_MEM(n*sizeof(LIS_SCALAR));
186 		return LIS_OUT_OF_MEMORY;
187 	}
188 	w   = (LIS_SCALAR *)lis_malloc(n*sizeof(LIS_SCALAR),"lis_precon_create_iluc_csr::w");
189 	if( w==NULL )
190 	{
191 		LIS_SETERR_MEM(n*sizeof(LIS_SCALAR));
192 		return LIS_OUT_OF_MEMORY;
193 	}
194 	z   = (LIS_SCALAR *)lis_malloc(n*sizeof(LIS_SCALAR),"lis_precon_create_iluc_csr::z");
195 	if( z==NULL )
196 	{
197 		LIS_SETERR_MEM(n*sizeof(LIS_SCALAR));
198 		return LIS_OUT_OF_MEMORY;
199 	}
200 
201 	iw   = (LIS_INT *)lis_malloc(nprocs*n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::iw");
202 	if( iw==NULL )
203 	{
204 		LIS_SETERR_MEM(nprocs*n*sizeof(LIS_INT));
205 		return LIS_OUT_OF_MEMORY;
206 	}
207 	iw2  = (LIS_INT *)lis_malloc( nprocs*(n+1)*sizeof(LIS_INT), "lis_precon_create_iluc_csr::iw2" );
208 	if( iw2==NULL )
209 	{
210 		LIS_SETERR_MEM(nprocs*(n+1)*sizeof(LIS_INT));
211 		return LIS_OUT_OF_MEMORY;
212 	}
213 	wc   = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::wc");
214 	if( wc==NULL )
215 	{
216 		LIS_SETERR_MEM(n*sizeof(LIS_INT));
217 		return LIS_OUT_OF_MEMORY;
218 	}
219 	wl   = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::wl");
220 	if( wl==NULL )
221 	{
222 		LIS_SETERR_MEM(n*sizeof(LIS_INT));
223 		return LIS_OUT_OF_MEMORY;
224 	}
225 	iz   = (LIS_INT *)lis_malloc(nprocs*n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::iz");
226 	if( iw==NULL )
227 	{
228 		LIS_SETERR_MEM(n*sizeof(LIS_INT));
229 		return LIS_OUT_OF_MEMORY;
230 	}
231 	zc   = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::zc");
232 	if( wc==NULL )
233 	{
234 		LIS_SETERR_MEM(n*sizeof(LIS_INT));
235 		return LIS_OUT_OF_MEMORY;
236 	}
237 	zl   = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::zl");
238 	if( wl==NULL )
239 	{
240 		LIS_SETERR_MEM(n*sizeof(LIS_INT));
241 		return LIS_OUT_OF_MEMORY;
242 	}
243 
244 	nnz = A->L->nnz;
245 	err = lis_matrix_malloc_csr(n,nnz,&ptr,&index,&value);
246 
247 
248 	#pragma omp parallel private(i,ii,j,jj,k,kk,l,ll,is,ie,my_rank,cz,cw,toldd,t,nnz,len)
249 	{
250 		my_rank  = omp_get_thread_num();
251 		LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
252 
253 
254 		#pragma omp for
255 		for(i=0;i<nprocs;i++)
256 		{
257 			memset( &iw2[i*n], 0, n*sizeof(LIS_INT) );
258 		}
259 		#pragma omp for
260 		for(i=0;i<=n;i++)
261 		{
262 			ptr[i] = 0;
263 		}
264 		#pragma omp for
265 		for(i=0;i<n;i++)
266 		{
267 			for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
268 			{
269 				jj           = A->L->index[j];
270 /*				if( jj>=is && jj<ie )*/
271 				{
272 					iw2[my_rank*n + jj]++;
273 				}
274 			}
275 		}
276 		#pragma omp for
277 		for(j=0;j<n;j++)
278 		{
279 			k = 0;
280 			for(i=0;i<nprocs;i++)
281 			{
282 				k += iw2[i*n+j];
283 			}
284 			iw[j] = k;
285 		}
286 		#pragma omp single
287 		for(j=0;j<n;j++)
288 		{
289 			ptr[j+1] = ptr[j] + iw[j];
290 		}
291 		#pragma omp for
292 		for(j=0;j<n;j++)
293 		{
294 			k = ptr[j];
295 			for(i=0;i<nprocs;i++)
296 			{
297 				l = iw2[i*n+j];
298 				iw2[i*n+j] = k;
299 				k = l + k;
300 			}
301 		}
302 		#pragma omp for
303 		for(i=0;i<n;i++)
304 		{
305 			for(j=ptr[i];j<ptr[i+1];j++)
306 			{
307 				value[j] = 0.0;
308 				index[j] = 0;
309 			}
310 		}
311 		#pragma omp for
312 		for(i=0;i<n;i++)
313 		{
314 			for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
315 			{
316 				k        = A->L->index[j];
317 /*				if( k>=is && k<ie )*/
318 				{
319 					l        = iw2[my_rank*n+k];
320 					value[l] = A->L->value[j];
321 					index[l] = i;
322 					iw2[my_rank*n+k]++;
323 				}
324 			}
325 		}
326 
327 
328 		for(i=is;i<ie;i++)
329 		{
330 			D->value[i] = A->D->value[i];
331 			zl[i]       = -1;
332 			wl[i]       = -1;
333 			zc[i]       = 0;
334 			wc[i]       = 0;
335 		}
336 
337 
338 	for(k=is;k<ie;k++)
339 	{
340 		cz = 0;
341 		cw = 0;
342 		/* z_j = a_kj (j>=k) */
343 		for(j=A->U->ptr[k];j<A->U->ptr[k+1];j++)
344 		{
345 			jj       = A->U->index[j];
346 			if( jj<is || jj>=ie ) continue;
347 			#ifdef USE_MPI
348 				if( jj>n-1 ) continue;
349 			#endif
350 			z[jj]    = A->U->value[j];
351 			iz[my_rank*n+cz++] = jj;
352 			zc[jj]   = 1;
353 		}
354 
355 		/* w_j = a_jk (j>k) */
356 		for(j=ptr[k];j<ptr[k+1];j++)
357 		{
358 			jj       = index[j];
359 			if( jj<is || jj>=ie ) continue;
360 			w[jj]    = value[j];
361 			iw[my_rank*n+cw++] = jj;
362 			wc[jj]   = 1;
363 		}
364 
365 		/* z_j = z_j - l_ki*u_ij */
366 		i = wl[k];
367 		while( i>=is )
368 		{
369 			ii  = wc[i];
370 			kk  = zc[i];
371 			val = L->value[i][ii];
372 			for(j=kk;j<U->nnz[i];j++)
373 			{
374 				jj     = U->index[i][j];
375 				if( jj==k ) continue;
376 				if( zc[jj]==1 )
377 				{
378 					z[jj]   -= val*U->value[i][j];
379 				}
380 				else
381 				{
382 					z[jj]    = -val*U->value[i][j];
383 					iz[my_rank*n+cz++] = jj;
384 					zc[jj]   = 1;
385 				}
386 			}
387 			ii++;
388 			wc[i] = ii;
389 			if( ii<L->nnz[i] )
390 			{
391 				kk     = L->index[i][ii];
392 				ll     = i;
393 				i      = wl[ll];
394 				wl[ll] = wl[kk];
395 				wl[kk] = ll;
396 			}
397 			else
398 			{
399 				i      = wl[i];
400 			}
401 		}
402 
403 		/* w_j = w_j - l_ji*u_ik */
404 		i = zl[k];
405 		while( i>=is )
406 		{
407 			ii  = zc[i];
408 			kk  = wc[i];
409 			val = U->value[i][ii];
410 			for(j=kk;j<L->nnz[i];j++)
411 			{
412 				jj     = L->index[i][j];
413 				if( wc[jj]==1 )
414 				{
415 					w[jj]   -= val*L->value[i][j];
416 				}
417 				else
418 				{
419 					w[jj]    = -val*L->value[i][j];
420 					iw[my_rank*n+cw++] = jj;
421 					wc[jj]   = 1;
422 				}
423 			}
424 			ii++;
425 			zc[i] = ii;
426 			if( ii<U->nnz[i] )
427 			{
428 				kk     = U->index[i][ii];
429 				ll     = i;
430 				i      = zl[ll];
431 				zl[ll] = zl[kk];
432 				zl[kk] = ll;
433 			}
434 			else
435 			{
436 				i      = zl[i];
437 			}
438 		}
439 
440 		toldd       = fabs(D->value[k])*tol;
441 		D->value[k] = 1.0/D->value[k];
442 		t           = D->value[k];
443 		if( cz<cw )
444 		{
445 			for(j=0;j<cz;j++)
446 			{
447 				jj = iz[my_rank*n+j];
448 				if( wc[jj] )
449 				{
450 					D->value[jj] -= z[jj]*w[jj]*t;
451 				}
452 			}
453 		}
454 		else
455 		{
456 			for(j=0;j<cw;j++)
457 			{
458 				jj = iw[my_rank*n+j];
459 				if( zc[jj] )
460 				{
461 					D->value[jj] -= z[jj]*w[jj]*t;
462 				}
463 			}
464 		}
465 		/* drop U */
466 		nnz = 0;
467 		for(j=0;j<cz;j++)
468 		{
469 			jj = iz[my_rank*n+j];
470 			if( fabs(z[jj])>toldd )
471 			{
472 				iz[my_rank*n+nnz++] = jj;
473 			}
474 			else
475 			{
476 				z[k] = z[k] + fabs(z[jj]);
477 				zc[jj] = 0;
478 			}
479 		}
480 		len = _min(lfil,nnz);
481 		for(j=0;j<nnz;j++) tmp[my_rank*n+j] = fabs(z[is+j]);
482 		lis_sort_di(0,nnz-1,&tmp[my_rank*n],&iz[my_rank*n]);
483 		lis_sort_i(0,len-1,&iz[my_rank*n]);
484 		U->nnz[k] = len;
485 		if( len>0 )
486 		{
487 			U->index[k] = (LIS_INT *)malloc(len*sizeof(LIS_INT));
488 			U->value[k] = (LIS_SCALAR *)malloc(len*sizeof(LIS_SCALAR));
489 		}
490 		for(j=0;j<len;j++)
491 		{
492 			jj = iz[my_rank*n+j];
493 			U->index[k][j] = jj;
494 			U->value[k][j] = z[jj];
495 		}
496 		for(j=len;j<cz;j++) zc[iz[my_rank*n+j]] = 0;
497 		cz = nnz;
498 
499 		/* drop L */
500 		nnz = 0;
501 		for(j=0;j<cw;j++)
502 		{
503 			jj = iw[my_rank*n+j];
504 			if( fabs(w[jj])>toldd )
505 			{
506 				iw[my_rank*n+nnz++] = jj;
507 			}
508 			else
509 			{
510 				wc[jj] = 0;
511 			}
512 		}
513 		len = _min(lfil,nnz);
514 		for(j=0;j<nnz;j++) tmp[my_rank*n+j] = fabs(w[is+j]);
515 		lis_sort_di(0,nnz-1,&tmp[my_rank*n],&iw[my_rank*n]);
516 		lis_sort_i(0,len-1,&iw[my_rank*n]);
517 		L->nnz[k] = len;
518 		if( len>0 )
519 		{
520 			L->index[k] = (LIS_INT *)malloc(len*sizeof(LIS_INT));
521 			L->value[k] = (LIS_SCALAR *)malloc(len*sizeof(LIS_SCALAR));
522 		}
523 		for(j=0;j<len;j++)
524 		{
525 			jj = iw[my_rank*n+j];
526 			L->index[k][j] = jj;
527 			L->value[k][j] = w[jj]*t;
528 		}
529 		for(j=len;j<cw;j++) wc[iw[my_rank*n+j]] = 0;
530 		cw = nnz;
531 
532 		/**/
533 		for(j=0;j<cz;j++) zc[iz[my_rank*n+j]] = 0;
534 		for(j=0;j<cw;j++) wc[iw[my_rank*n+j]] = 0;
535 		if(U->nnz[k]>0)
536 		{
537 			jj     = iz[my_rank*n+0];
538 			zc[k]  = 0;
539 			zl[k]  = zl[jj];
540 			zl[jj] = k;
541 		}
542 		if(L->nnz[k]>0)
543 		{
544 			jj     = iw[my_rank*n+0];
545 			wc[k]  = 0;
546 			wl[k]  = wl[jj];
547 			wl[jj] = k;
548 		}
549 	}
550 	}
551 
552 	precon->L  = L;
553 	precon->U  = U;
554 	precon->D  = D;
555 
556 	lis_free2(13,tmp,w,z,iw,iw2,wc,wl,iz,zc,zl,ptr,index,value);
557 
558 	LIS_DEBUG_FUNC_OUT;
559 	return LIS_SUCCESS;
560 #else
561 	LIS_INT	err;
562 	LIS_INT	i,j,k,l,ii,jj,kk,ll;
563 	LIS_INT	n,nnz,annz,lfil,len;
564 	LIS_INT	cz,cw;
565 	LIS_INT	*iw,*wc,*wl,*iz,*zc,*zl;
566 	LIS_INT	*index,*ptr;
567 	LIS_SCALAR gamma,val;
568 	LIS_REAL t,tol,toldd;
569 	LIS_SCALAR *z,*w,*tmp;
570 	LIS_SCALAR *value;
571 	LIS_SCALAR m;
572 	LIS_MATRIX A;
573 	LIS_MATRIX_ILU L,U;
574 	LIS_VECTOR D;
575 
576 	LIS_DEBUG_FUNC_IN;
577 
578 
579 	A      = solver->A;
580 	n      = A->n;
581 	tol    = solver->params[LIS_PARAMS_DROP-LIS_OPTIONS_LEN];
582 	m      = solver->params[LIS_PARAMS_RATE-LIS_OPTIONS_LEN];
583 	gamma  = solver->params[LIS_PARAMS_GAMMA-LIS_OPTIONS_LEN];
584 	annz   = 10+A->nnz / A->n;
585 	lfil   = (LIS_INT)(((double)A->nnz/(2.0*n))*m);
586 
587 	L      = NULL;
588 	U      = NULL;
589 
590 
591 	err = lis_matrix_ilu_create(n,1,&L);
592 	if( err ) return err;
593 	err = lis_matrix_ilu_create(n,1,&U);
594 	if( err ) return err;
595 	err = lis_matrix_ilu_setCR(L);
596 	if( err ) return err;
597 	err = lis_matrix_ilu_setCR(U);
598 	if( err ) return err;
599 	err = lis_vector_duplicate(A,&D);
600 	if( err )
601 	{
602 		return err;
603 	}
604 
605 	tmp   = (LIS_SCALAR *)lis_malloc(n*sizeof(LIS_SCALAR),"lis_precon_create_iluc_csr::tmp");
606 	if( tmp==NULL )
607 	{
608 		LIS_SETERR_MEM(n*sizeof(LIS_SCALAR));
609 		return LIS_OUT_OF_MEMORY;
610 	}
611 	w   = (LIS_SCALAR *)lis_malloc(n*sizeof(LIS_SCALAR),"lis_precon_create_iluc_csr::w");
612 	if( w==NULL )
613 	{
614 		LIS_SETERR_MEM(n*sizeof(LIS_SCALAR));
615 		return LIS_OUT_OF_MEMORY;
616 	}
617 	z   = (LIS_SCALAR *)lis_malloc(n*sizeof(LIS_SCALAR),"lis_precon_create_iluc_csr::z");
618 	if( z==NULL )
619 	{
620 		LIS_SETERR_MEM(n*sizeof(LIS_SCALAR));
621 		return LIS_OUT_OF_MEMORY;
622 	}
623 
624 	iw   = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::iw");
625 	if( iw==NULL )
626 	{
627 		LIS_SETERR_MEM(n*sizeof(LIS_INT));
628 		return LIS_OUT_OF_MEMORY;
629 	}
630 	wc   = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::wc");
631 	if( wc==NULL )
632 	{
633 		LIS_SETERR_MEM(n*sizeof(LIS_INT));
634 		return LIS_OUT_OF_MEMORY;
635 	}
636 	wl   = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::wl");
637 	if( wl==NULL )
638 	{
639 		LIS_SETERR_MEM(n*sizeof(LIS_INT));
640 		return LIS_OUT_OF_MEMORY;
641 	}
642 	iz   = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::iz");
643 	if( iw==NULL )
644 	{
645 		LIS_SETERR_MEM(n*sizeof(LIS_INT));
646 		return LIS_OUT_OF_MEMORY;
647 	}
648 	zc   = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::zc");
649 	if( wc==NULL )
650 	{
651 		LIS_SETERR_MEM(n*sizeof(LIS_INT));
652 		return LIS_OUT_OF_MEMORY;
653 	}
654 	zl   = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::zl");
655 	if( wl==NULL )
656 	{
657 		LIS_SETERR_MEM(n*sizeof(LIS_INT));
658 		return LIS_OUT_OF_MEMORY;
659 	}
660 
661 	lis_matrix_sort_csr(A);
662 	nnz = A->L->nnz;
663 	err = lis_matrix_malloc_csr(n,nnz,&ptr,&index,&value);
664 
665 	for(i=0;i<n;i++) iw[i] = 0;
666 	for(i=0;i<n;i++)
667 	{
668 		for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
669 		{
670 			jj = A->L->index[j];
671 			iw[jj]++;
672 		}
673 	}
674 	ptr[0] = 0;
675 	for(i=0;i<n;i++)
676 	{
677 		ptr[i+1] = ptr[i] + iw[i];
678 		iw[i]    = ptr[i];
679 	}
680 	for(i=0;i<n;i++)
681 	{
682 		for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
683 		{
684 			k        = A->L->index[j];
685 			l        = iw[k];
686 			value[l] = A->L->value[j];
687 			index[l] = i;
688 			iw[k]++;
689 		}
690 	}
691 
692 
693 	for(i=0;i<n;i++)
694 	{
695 		D->value[i] = gamma * A->D->value[i];
696 		zl[i]       = -1;
697 		wl[i]       = -1;
698 		zc[i]       = 0;
699 		wc[i]       = 0;
700 	}
701 
702 
703 	for(k=0;k<n;k++)
704 	{
705 		cz = 0;
706 		cw = 0;
707 		/* z_j = a_kj (j>=k) */
708 		for(j=A->U->ptr[k];j<A->U->ptr[k+1];j++)
709 		{
710 			jj       = A->U->index[j];
711 			#ifdef USE_MPI
712 				if( jj>n-1 ) continue;
713 			#endif
714 			z[jj]    = A->U->value[j];
715 			iz[cz++] = jj;
716 			zc[jj]   = 1;
717 		}
718 
719 		/* w_j = a_jk (j>k) */
720 		for(j=ptr[k];j<ptr[k+1];j++)
721 		{
722 			jj       = index[j];
723 			w[jj]    = value[j];
724 			iw[cw++] = jj;
725 			wc[jj]   = 1;
726 		}
727 
728 		/* z_j = z_j - l_ki*u_ij */
729 		i = wl[k];
730 		while( i>=0 )
731 		{
732 			ii  = wc[i];
733 			kk  = zc[i];
734 			val = L->value[i][ii];
735 			for(j=kk;j<U->nnz[i];j++)
736 			{
737 				jj     = U->index[i][j];
738 				if( jj==k ) continue;
739 				if( zc[jj]==1 )
740 				{
741 					z[jj]   -= val*U->value[i][j];
742 				}
743 				else
744 				{
745 					z[jj]    = -val*U->value[i][j];
746 					iz[cz++] = jj;
747 					zc[jj]   = 1;
748 				}
749 			}
750 			ii++;
751 			wc[i] = ii;
752 			if( ii<L->nnz[i] )
753 			{
754 				kk     = L->index[i][ii];
755 				ll     = i;
756 				i      = wl[ll];
757 				wl[ll] = wl[kk];
758 				wl[kk] = ll;
759 			}
760 			else
761 			{
762 				i      = wl[i];
763 			}
764 		}
765 
766 		/* w_j = w_j - l_ji*u_ik */
767 		i = zl[k];
768 		while( i>=0 )
769 		{
770 			ii  = zc[i];
771 			kk  = wc[i];
772 			val = U->value[i][ii];
773 			for(j=kk;j<L->nnz[i];j++)
774 			{
775 				jj     = L->index[i][j];
776 				if( wc[jj]==1 )
777 				{
778 					w[jj]   -= val*L->value[i][j];
779 				}
780 				else
781 				{
782 					w[jj]    = -val*L->value[i][j];
783 					iw[cw++] = jj;
784 					wc[jj]   = 1;
785 				}
786 			}
787 			ii++;
788 			zc[i] = ii;
789 			if( ii<U->nnz[i] )
790 			{
791 				kk     = U->index[i][ii];
792 				ll     = i;
793 				i      = zl[ll];
794 				zl[ll] = zl[kk];
795 				zl[kk] = ll;
796 			}
797 			else
798 			{
799 				i      = zl[i];
800 			}
801 		}
802 
803 		toldd       = fabs(D->value[k])*tol;
804 		D->value[k] = 1.0/D->value[k];
805 		t           = D->value[k];
806 		if( cz<cw )
807 		{
808 			for(j=0;j<cz;j++)
809 			{
810 				jj = iz[j];
811 				if( wc[jj] )
812 				{
813 					D->value[jj] -= z[jj]*w[jj]*t;
814 				}
815 			}
816 		}
817 		else
818 		{
819 			for(j=0;j<cw;j++)
820 			{
821 				jj = iw[j];
822 				if( zc[jj] )
823 				{
824 					D->value[jj] -= z[jj]*w[jj]*t;
825 				}
826 			}
827 		}
828 		/* drop U */
829 		nnz = 0;
830 		for(j=0;j<cz;j++)
831 		{
832 			jj = iz[j];
833 			if( fabs(z[jj])>toldd )
834 			{
835 				iz[nnz++] = jj;
836 			}
837 			else
838 			{
839 				z[k] = z[k] + fabs(z[jj]);
840 				zc[jj] = 0;
841 			}
842 		}
843 		len = _min(lfil,nnz);
844 		for(j=0;j<nnz;j++) tmp[j] = fabs(z[j]);
845 		lis_sort_di(0,nnz-1,tmp,iz);
846 		lis_sort_i(0,len-1,iz);
847 		U->nnz[k] = len;
848 		if( len>0 )
849 		{
850 			U->index[k] = (LIS_INT *)malloc(len*sizeof(LIS_INT));
851 			U->value[k] = (LIS_SCALAR *)malloc(len*sizeof(LIS_SCALAR));
852 		}
853 		for(j=0;j<len;j++)
854 		{
855 			jj = iz[j];
856 			U->index[k][j] = jj;
857 			U->value[k][j] = z[jj];
858 		}
859 		for(j=len;j<cz;j++) zc[iz[j]] = 0;
860 		cz = nnz;
861 
862 		/* drop L */
863 		nnz = 0;
864 		for(j=0;j<cw;j++)
865 		{
866 			jj = iw[j];
867 			if( fabs(w[jj])>toldd )
868 			{
869 				iw[nnz++] = jj;
870 			}
871 			else
872 			{
873 				wc[jj] = 0;
874 			}
875 		}
876 		len = _min(lfil,nnz);
877 		for(j=0;j<nnz;j++) tmp[j] = fabs(w[j]);
878 		lis_sort_di(0,nnz-1,tmp,iw);
879 		lis_sort_i(0,len-1,iw);
880 		L->nnz[k] = len;
881 		if( len>0 )
882 		{
883 			L->index[k] = (LIS_INT *)malloc(len*sizeof(LIS_INT));
884 			L->value[k] = (LIS_SCALAR *)malloc(len*sizeof(LIS_SCALAR));
885 		}
886 		for(j=0;j<len;j++)
887 		{
888 			jj = iw[j];
889 			L->index[k][j] = jj;
890 			L->value[k][j] = w[jj]*t;
891 		}
892 		for(j=len;j<cw;j++) wc[iw[j]] = 0;
893 		cw = nnz;
894 
895 		/**/
896 		for(j=0;j<cz;j++) zc[iz[j]] = 0;
897 		for(j=0;j<cw;j++) wc[iw[j]] = 0;
898 		if(U->nnz[k]>0)
899 		{
900 			jj     = iz[0];
901 			zc[k]  = 0;
902 			zl[k]  = zl[jj];
903 			zl[jj] = k;
904 		}
905 		if(L->nnz[k]>0)
906 		{
907 			jj     = iw[0];
908 			wc[k]  = 0;
909 			wl[k]  = wl[jj];
910 			wl[jj] = k;
911 		}
912 	}
913 
914 	precon->L  = L;
915 	precon->U  = U;
916 	precon->D  = D;
917 
918 	lis_free2(12,tmp,w,z,iw,wc,wl,iz,zc,zl,ptr,index,value);
919 
920 	LIS_DEBUG_FUNC_OUT;
921 	return LIS_SUCCESS;
922 #endif
923 }
924 
925 #undef __FUNC__
926 #define __FUNC__ "lis_precon_create_iluc_bsr"
lis_precon_create_iluc_bsr(LIS_SOLVER solver,LIS_PRECON precon)927 LIS_INT lis_precon_create_iluc_bsr(LIS_SOLVER solver, LIS_PRECON precon)
928 {
929 	LIS_Comm comm;
930 	LIS_INT err;
931 	LIS_INT	i,j,k,l,ii,jj,kk,ll,bnr,bs;
932 	LIS_INT	n,nr,nnz,bnnz,lfil,len;
933 	LIS_INT	cz,cw;
934 	LIS_INT	*iw,*wc,*wl,*iz,*zc,*zl;
935 	LIS_INT	*index,*ptr;
936 	LIS_SCALAR gamma,val;
937 	LIS_REAL t,tol,toldd;
938 	LIS_SCALAR *z,*w,*tmp,wz[9],lu[9];
939 	LIS_SCALAR *value;
940 	LIS_SCALAR m,a;
941 	LIS_MATRIX A;
942 	LIS_MATRIX_ILU L,U;
943 	LIS_MATRIX_DIAG	D;
944 
945 	LIS_DEBUG_FUNC_IN;
946 
947 	comm = LIS_COMM_WORLD;
948 
949 	A      = solver->A;
950 	n      = A->n;
951 	nr     = A->nr;
952 	bnr    = A->bnr;
953 	bs     = bnr*bnr;
954 	tol    = solver->params[LIS_PARAMS_DROP-LIS_OPTIONS_LEN];
955 	m      = solver->params[LIS_PARAMS_RATE-LIS_OPTIONS_LEN];
956 	gamma  = solver->params[LIS_PARAMS_GAMMA-LIS_OPTIONS_LEN];
957 	lfil   = (LIS_INT)(((double)A->bnnz/(2.0*nr))*m);
958 
959 	L      = NULL;
960 	U      = NULL;
961 
962 
963 	err = lis_matrix_ilu_create(nr,bnr,&L);
964 	if( err ) return err;
965 	err = lis_matrix_ilu_create(nr,bnr,&U);
966 	if( err ) return err;
967 	err = lis_matrix_ilu_setCR(L);
968 	if( err ) return err;
969 	err = lis_matrix_ilu_setCR(U);
970 	if( err ) return err;
971 	err = lis_matrix_diag_duplicateM(A,&D);
972 	if( err )
973 	{
974 		return err;
975 	}
976 
977 	tmp   = (LIS_SCALAR *)lis_malloc(n*sizeof(LIS_SCALAR),"lis_precon_create_iluc_csr::tmp");
978 	if( tmp==NULL )
979 	{
980 		LIS_SETERR_MEM(n*sizeof(LIS_SCALAR));
981 		return LIS_OUT_OF_MEMORY;
982 	}
983 	w   = (LIS_SCALAR *)lis_malloc(bs*nr*sizeof(LIS_SCALAR),"lis_precon_create_iluc_csr::w");
984 	if( w==NULL )
985 	{
986 		LIS_SETERR_MEM(n*sizeof(LIS_SCALAR));
987 		return LIS_OUT_OF_MEMORY;
988 	}
989 	z   = (LIS_SCALAR *)lis_malloc(bs*nr*sizeof(LIS_SCALAR),"lis_precon_create_iluc_csr::z");
990 	if( z==NULL )
991 	{
992 		LIS_SETERR_MEM(n*sizeof(LIS_SCALAR));
993 		return LIS_OUT_OF_MEMORY;
994 	}
995 
996 	iw   = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::iw");
997 	if( iw==NULL )
998 	{
999 		LIS_SETERR_MEM(n*sizeof(LIS_INT));
1000 		return LIS_OUT_OF_MEMORY;
1001 	}
1002 	wc   = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::wc");
1003 	if( wc==NULL )
1004 	{
1005 		LIS_SETERR_MEM(n*sizeof(LIS_INT));
1006 		return LIS_OUT_OF_MEMORY;
1007 	}
1008 	wl   = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::wl");
1009 	if( wl==NULL )
1010 	{
1011 		LIS_SETERR_MEM(n*sizeof(LIS_INT));
1012 		return LIS_OUT_OF_MEMORY;
1013 	}
1014 	iz   = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::iz");
1015 	if( iw==NULL )
1016 	{
1017 		LIS_SETERR_MEM(n*sizeof(LIS_INT));
1018 		return LIS_OUT_OF_MEMORY;
1019 	}
1020 	zc   = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::zc");
1021 	if( wc==NULL )
1022 	{
1023 		LIS_SETERR_MEM(n*sizeof(LIS_INT));
1024 		return LIS_OUT_OF_MEMORY;
1025 	}
1026 	zl   = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::zl");
1027 	if( wl==NULL )
1028 	{
1029 		LIS_SETERR_MEM(n*sizeof(LIS_INT));
1030 		return LIS_OUT_OF_MEMORY;
1031 	}
1032 
1033 	bnnz = A->L->bnnz;
1034 	err = lis_matrix_malloc_bsr(n,bnr,bnr,bnnz,&ptr,&index,&value);
1035 
1036 	for(i=0;i<nr;i++) iw[i] = 0;
1037 	for(i=0;i<nr;i++)
1038 	{
1039 		for(j=A->L->bptr[i];j<A->L->bptr[i+1];j++)
1040 		{
1041 			jj = A->L->bindex[j];
1042 			iw[jj]++;
1043 		}
1044 	}
1045 	ptr[0] = 0;
1046 	for(i=0;i<nr;i++)
1047 	{
1048 		ptr[i+1] = ptr[i] + iw[i];
1049 		iw[i]    = ptr[i];
1050 	}
1051 	for(i=0;i<nr;i++)
1052 	{
1053 		for(j=A->L->bptr[i];j<A->L->bptr[i+1];j++)
1054 		{
1055 			k        = A->L->bindex[j];
1056 			l        = iw[k];
1057 			memcpy(&value[bs*l],&A->L->value[bs*j],bs*sizeof(LIS_SCALAR));
1058 			index[l] = i;
1059 			iw[k]++;
1060 		}
1061 	}
1062 
1063 
1064 	switch(bnr)
1065 	{
1066 	case 1:
1067 		for(i=0;i<nr;i++)
1068 		{
1069 			D->value[i] = gamma*A->D->value[i];
1070 			zl[i]       = -1;
1071 			wl[i]       = -1;
1072 			zc[i]       = 0;
1073 			wc[i]       = 0;
1074 		}
1075 		break;
1076 	case 2:
1077 		for(i=0;i<nr;i++)
1078 		{
1079 			D->value[4*i+0] = gamma*A->D->value[4*i+0];
1080 			D->value[4*i+1] = gamma*A->D->value[4*i+1];
1081 			D->value[4*i+2] = gamma*A->D->value[4*i+2];
1082 			D->value[4*i+3] = gamma*A->D->value[4*i+3];
1083 			zl[i]       = -1;
1084 			wl[i]       = -1;
1085 			zc[i]       = 0;
1086 			wc[i]       = 0;
1087 		}
1088 		if( n%2!=0 )
1089 		{
1090 			D->value[4*(nr-1)+3] = 1.0;
1091 		}
1092 		break;
1093 	case 3:
1094 		for(i=0;i<nr;i++)
1095 		{
1096 			D->value[9*i+0] = gamma*A->D->value[9*i+0];
1097 			D->value[9*i+1] = gamma*A->D->value[9*i+1];
1098 			D->value[9*i+2] = gamma*A->D->value[9*i+2];
1099 			D->value[9*i+3] = gamma*A->D->value[9*i+3];
1100 			D->value[9*i+4] = gamma*A->D->value[9*i+4];
1101 			D->value[9*i+5] = gamma*A->D->value[9*i+5];
1102 			D->value[9*i+6] = gamma*A->D->value[9*i+6];
1103 			D->value[9*i+7] = gamma*A->D->value[9*i+7];
1104 			D->value[9*i+8] = gamma*A->D->value[9*i+8];
1105 			zl[i]       = -1;
1106 			wl[i]       = -1;
1107 			zc[i]       = 0;
1108 			wc[i]       = 0;
1109 		}
1110 			if( n%3==1 )
1111 			{
1112 				D->value[9*(nr-1)+4] = 1.0;
1113 				D->value[9*(nr-1)+8] = 1.0;
1114 			}
1115 			else if( n%3==2 )
1116 			{
1117 				D->value[9*(nr-1)+8] = 1.0;
1118 			}
1119 		break;
1120 	}
1121 
1122 
1123 	for(k=0;k<nr;k++)
1124 	{
1125 		cz = 0;
1126 		cw = 0;
1127 		/* z_j = a_kj (j>=k) */
1128 		for(j=A->U->bptr[k];j<A->U->bptr[k+1];j++)
1129 		{
1130 			jj       = A->U->bindex[j];
1131 			#ifdef USE_MPI
1132 				if( jj>nr-1 ) continue;
1133 			#endif
1134 #if 0
1135 			switch(bnr)
1136 			{
1137 			case 1:
1138 				toldd       = fabs(D->value[k]);
1139 				t           = fabs(A->U->value[j]);
1140 				break;
1141 			case 2:
1142 				toldd       = fabs(D->value[4*k+0]) + fabs(D->value[4*k+1]);
1143 				toldd      += fabs(D->value[4*k+2]) + fabs(D->value[4*k+3]);
1144 /*				toldd       = sqrt(toldd);*/
1145 				t       = fabs(A->U->value[4*j+0]) + fabs(A->U->value[4*j+1]);
1146 				t      += fabs(A->U->value[4*j+2]) + fabs(A->U->value[4*j+3]);
1147 /*				t       = sqrt(t);*/
1148 				break;
1149 			case 3:
1150 				toldd       = fabs(D->value[9*k+0]) + fabs(D->value[9*k+1]) + fabs(D->value[9*k+2]);
1151 				toldd      += fabs(D->value[9*k+3]) + fabs(D->value[9*k+4]) + fabs(D->value[9*k+5]);
1152 				toldd      += fabs(D->value[9*k+6]) + fabs(D->value[9*k+7]) + fabs(D->value[9*k+8]);
1153 /*				toldd       = sqrt(toldd);*/
1154 				t       = fabs(A->U->value[9*j+0]) + fabs(A->U->value[9*j+1]) + fabs(A->U->value[9*j+2]);
1155 				t      += fabs(A->U->value[9*j+3]) + fabs(A->U->value[9*j+4]) + fabs(A->U->value[9*j+5]);
1156 				t      += fabs(A->U->value[9*j+6]) + fabs(A->U->value[9*j+7]) + fabs(A->U->value[9*j+8]);
1157 /*				t       = sqrt(t);*/
1158 				break;
1159 			}
1160 			if( toldd>t*1.0e-10 )
1161 			{
1162 				memcpy(&z[bs*jj],&A->U->value[bs*j],bs*sizeof(LIS_SCALAR));
1163 				iz[cz++] = jj;
1164 				zc[jj]   = 1;
1165 			}
1166 #else
1167 			memcpy(&z[bs*jj],&A->U->value[bs*j],bs*sizeof(LIS_SCALAR));
1168 			iz[cz++] = jj;
1169 			zc[jj]   = 1;
1170 #endif
1171 		}
1172 
1173 		/* w_j = a_jk (j>k) */
1174 		for(j=ptr[k];j<ptr[k+1];j++)
1175 		{
1176 			jj       = index[j];
1177 #if 0
1178 			switch(bnr)
1179 			{
1180 			case 1:
1181 				toldd       = fabs(D->value[k]);
1182 				t           = fabs(value[j]);
1183 				break;
1184 			case 2:
1185 				toldd       = fabs(D->value[4*k+0]) + fabs(D->value[4*k+1]);
1186 				toldd      += fabs(D->value[4*k+2]) + fabs(D->value[4*k+3]);
1187 /*				toldd       = sqrt(toldd);*/
1188 				t       = fabs(value[4*j+0]) + fabs(value[4*j+1]);
1189 				t      += fabs(value[4*j+2]) + fabs(value[4*j+3]);
1190 /*				t       = sqrt(t);*/
1191 				break;
1192 			case 3:
1193 				toldd       = fabs(D->value[9*k+0]) + fabs(D->value[9*k+1]) + fabs(D->value[9*k+2]);
1194 				toldd      += fabs(D->value[9*k+3]) + fabs(D->value[9*k+4]) + fabs(D->value[9*k+5]);
1195 				toldd      += fabs(D->value[9*k+6]) + fabs(D->value[9*k+7]) + fabs(D->value[9*k+8]);
1196 /*				toldd       = sqrt(toldd);*/
1197 				t       = fabs(value[9*j+0]) + fabs(value[9*j+1]) + fabs(value[9*j+2]);
1198 				t      += fabs(value[9*j+3]) + fabs(value[9*j+4]) + fabs(value[9*j+5]);
1199 				t      += fabs(value[9*j+6]) + fabs(value[9*j+7]) + fabs(value[9*j+8]);
1200 /*				t       = sqrt(t);*/
1201 				break;
1202 			}
1203 			if( toldd>t*1.0e-10 )
1204 			{
1205 				memcpy(&w[bs*jj],&value[bs*j],bs*sizeof(LIS_SCALAR));
1206 				iw[cw++] = jj;
1207 				wc[jj]   = 1;
1208 			}
1209 #else
1210 			memcpy(&w[bs*jj],&value[bs*j],bs*sizeof(LIS_SCALAR));
1211 			iw[cw++] = jj;
1212 			wc[jj]   = 1;
1213 #endif
1214 		}
1215 
1216 		/* z_j = z_j - l_ki*u_ij */
1217 		i = wl[k];
1218 		while( i>=0 )
1219 		{
1220 			ii  = wc[i];
1221 			kk  = zc[i];
1222 			val = L->value[i][ii];
1223 			for(j=kk;j<U->nnz[i];j++)
1224 			{
1225 				jj     = U->index[i][j];
1226 #if 1
1227 				if( jj==k ) continue;
1228 				if( zc[jj]==1 )
1229 #else
1230 				if( jj==k )
1231 				{
1232 					switch(bnr)
1233 					{
1234 					case 1:
1235 						D->value[jj]   -= val*U->value[i][j];
1236 						break;
1237 					case 2:
1238 						D->value[4*jj+0] -= L->value[i][4*ii+0]*U->value[i][4*j+0];
1239 						D->value[4*jj+0] -= L->value[i][4*ii+2]*U->value[i][4*j+1];
1240 						D->value[4*jj+1] -= L->value[i][4*ii+1]*U->value[i][4*j+0];
1241 						D->value[4*jj+1] -= L->value[i][4*ii+3]*U->value[i][4*j+1];
1242 						D->value[4*jj+2] -= L->value[i][4*ii+0]*U->value[i][4*j+2];
1243 						D->value[4*jj+2] -= L->value[i][4*ii+2]*U->value[i][4*j+3];
1244 						D->value[4*jj+3] -= L->value[i][4*ii+1]*U->value[i][4*j+2];
1245 						D->value[4*jj+3] -= L->value[i][4*ii+3]*U->value[i][4*j+3];
1246 						break;
1247 					case 3:
1248 						D->value[9*jj+0] -= L->value[i][9*ii+0]*U->value[i][9*j+0] + L->value[i][9*ii+3]*U->value[i][9*j+1] + L->value[i][9*ii+6]*U->value[i][9*j+2];
1249 						D->value[9*jj+1] -= L->value[i][9*ii+1]*U->value[i][9*j+0] + L->value[i][9*ii+4]*U->value[i][9*j+1] + L->value[i][9*ii+7]*U->value[i][9*j+2];
1250 						D->value[9*jj+2] -= L->value[i][9*ii+2]*U->value[i][9*j+0] + L->value[i][9*ii+5]*U->value[i][9*j+1] + L->value[i][9*ii+8]*U->value[i][9*j+2];
1251 						D->value[9*jj+3] -= L->value[i][9*ii+0]*U->value[i][9*j+3] + L->value[i][9*ii+3]*U->value[i][9*j+4] + L->value[i][9*ii+6]*U->value[i][9*j+5];
1252 						D->value[9*jj+4] -= L->value[i][9*ii+1]*U->value[i][9*j+3] + L->value[i][9*ii+4]*U->value[i][9*j+4] + L->value[i][9*ii+7]*U->value[i][9*j+5];
1253 						D->value[9*jj+5] -= L->value[i][9*ii+2]*U->value[i][9*j+3] + L->value[i][9*ii+5]*U->value[i][9*j+4] + L->value[i][9*ii+8]*U->value[i][9*j+5];
1254 						D->value[9*jj+6] -= L->value[i][9*ii+0]*U->value[i][9*j+6] + L->value[i][9*ii+3]*U->value[i][9*j+7] + L->value[i][9*ii+6]*U->value[i][9*j+8];
1255 						D->value[9*jj+7] -= L->value[i][9*ii+1]*U->value[i][9*j+6] + L->value[i][9*ii+4]*U->value[i][9*j+7] + L->value[i][9*ii+7]*U->value[i][9*j+8];
1256 						D->value[9*jj+8] -= L->value[i][9*ii+2]*U->value[i][9*j+6] + L->value[i][9*ii+5]*U->value[i][9*j+7] + L->value[i][9*ii+8]*U->value[i][9*j+8];
1257 						break;
1258 					}
1259 				}
1260 				else if( zc[jj]==1 )
1261 #endif
1262 				{
1263 					switch(bnr)
1264 					{
1265 					case 1:
1266 						z[jj]   -= val*U->value[i][j];
1267 						break;
1268 					case 2:
1269 						z[4*jj+0] -= L->value[i][4*ii+0]*U->value[i][4*j+0];
1270 						z[4*jj+0] -= L->value[i][4*ii+2]*U->value[i][4*j+1];
1271 						z[4*jj+1] -= L->value[i][4*ii+1]*U->value[i][4*j+0];
1272 						z[4*jj+1] -= L->value[i][4*ii+3]*U->value[i][4*j+1];
1273 						z[4*jj+2] -= L->value[i][4*ii+0]*U->value[i][4*j+2];
1274 						z[4*jj+2] -= L->value[i][4*ii+2]*U->value[i][4*j+3];
1275 						z[4*jj+3] -= L->value[i][4*ii+1]*U->value[i][4*j+2];
1276 						z[4*jj+3] -= L->value[i][4*ii+3]*U->value[i][4*j+3];
1277 						break;
1278 					case 3:
1279 						z[9*jj+0] -= L->value[i][9*ii+0]*U->value[i][9*j+0] + L->value[i][9*ii+3]*U->value[i][9*j+1] + L->value[i][9*ii+6]*U->value[i][9*j+2];
1280 						z[9*jj+1] -= L->value[i][9*ii+1]*U->value[i][9*j+0] + L->value[i][9*ii+4]*U->value[i][9*j+1] + L->value[i][9*ii+7]*U->value[i][9*j+2];
1281 						z[9*jj+2] -= L->value[i][9*ii+2]*U->value[i][9*j+0] + L->value[i][9*ii+5]*U->value[i][9*j+1] + L->value[i][9*ii+8]*U->value[i][9*j+2];
1282 						z[9*jj+3] -= L->value[i][9*ii+0]*U->value[i][9*j+3] + L->value[i][9*ii+3]*U->value[i][9*j+4] + L->value[i][9*ii+6]*U->value[i][9*j+5];
1283 						z[9*jj+4] -= L->value[i][9*ii+1]*U->value[i][9*j+3] + L->value[i][9*ii+4]*U->value[i][9*j+4] + L->value[i][9*ii+7]*U->value[i][9*j+5];
1284 						z[9*jj+5] -= L->value[i][9*ii+2]*U->value[i][9*j+3] + L->value[i][9*ii+5]*U->value[i][9*j+4] + L->value[i][9*ii+8]*U->value[i][9*j+5];
1285 						z[9*jj+6] -= L->value[i][9*ii+0]*U->value[i][9*j+6] + L->value[i][9*ii+3]*U->value[i][9*j+7] + L->value[i][9*ii+6]*U->value[i][9*j+8];
1286 						z[9*jj+7] -= L->value[i][9*ii+1]*U->value[i][9*j+6] + L->value[i][9*ii+4]*U->value[i][9*j+7] + L->value[i][9*ii+7]*U->value[i][9*j+8];
1287 						z[9*jj+8] -= L->value[i][9*ii+2]*U->value[i][9*j+6] + L->value[i][9*ii+5]*U->value[i][9*j+7] + L->value[i][9*ii+8]*U->value[i][9*j+8];
1288 						break;
1289 					}
1290 				}
1291 				else
1292 				{
1293 					switch(bnr)
1294 					{
1295 					case 1:
1296 						z[jj]   = -val*U->value[i][j];
1297 						break;
1298 					case 2:
1299 						z[4*jj+0] = -L->value[i][4*ii+0]*U->value[i][4*j+0] - L->value[i][4*ii+2]*U->value[i][4*j+1];
1300 						z[4*jj+1] = -L->value[i][4*ii+1]*U->value[i][4*j+0] - L->value[i][4*ii+3]*U->value[i][4*j+1];
1301 						z[4*jj+2] = -L->value[i][4*ii+0]*U->value[i][4*j+2] - L->value[i][4*ii+2]*U->value[i][4*j+3];
1302 						z[4*jj+3] = -L->value[i][4*ii+1]*U->value[i][4*j+2] - L->value[i][4*ii+3]*U->value[i][4*j+3];
1303 						break;
1304 					case 3:
1305 						z[9*jj+0] = -L->value[i][9*ii+0]*U->value[i][9*j+0] - L->value[i][9*ii+3]*U->value[i][9*j+1] - L->value[i][9*ii+6]*U->value[i][9*j+2];
1306 						z[9*jj+1] = -L->value[i][9*ii+1]*U->value[i][9*j+0] - L->value[i][9*ii+4]*U->value[i][9*j+1] - L->value[i][9*ii+7]*U->value[i][9*j+2];
1307 						z[9*jj+2] = -L->value[i][9*ii+2]*U->value[i][9*j+0] - L->value[i][9*ii+5]*U->value[i][9*j+1] - L->value[i][9*ii+8]*U->value[i][9*j+2];
1308 						z[9*jj+3] = -L->value[i][9*ii+0]*U->value[i][9*j+3] - L->value[i][9*ii+3]*U->value[i][9*j+4] - L->value[i][9*ii+6]*U->value[i][9*j+5];
1309 						z[9*jj+4] = -L->value[i][9*ii+1]*U->value[i][9*j+3] - L->value[i][9*ii+4]*U->value[i][9*j+4] - L->value[i][9*ii+7]*U->value[i][9*j+5];
1310 						z[9*jj+5] = -L->value[i][9*ii+2]*U->value[i][9*j+3] - L->value[i][9*ii+5]*U->value[i][9*j+4] - L->value[i][9*ii+8]*U->value[i][9*j+5];
1311 						z[9*jj+6] = -L->value[i][9*ii+0]*U->value[i][9*j+6] - L->value[i][9*ii+3]*U->value[i][9*j+7] - L->value[i][9*ii+6]*U->value[i][9*j+8];
1312 						z[9*jj+7] = -L->value[i][9*ii+1]*U->value[i][9*j+6] - L->value[i][9*ii+4]*U->value[i][9*j+7] - L->value[i][9*ii+7]*U->value[i][9*j+8];
1313 						z[9*jj+8] = -L->value[i][9*ii+2]*U->value[i][9*j+6] - L->value[i][9*ii+5]*U->value[i][9*j+7] - L->value[i][9*ii+8]*U->value[i][9*j+8];
1314 						break;
1315 					}
1316 					iz[cz++] = jj;
1317 					zc[jj]   = 1;
1318 				}
1319 			}
1320 			ii++;
1321 			wc[i] = ii;
1322 			if( ii<L->nnz[i] )
1323 			{
1324 				kk     = L->index[i][ii];
1325 				ll     = i;
1326 				i      = wl[ll];
1327 				wl[ll] = wl[kk];
1328 				wl[kk] = ll;
1329 			}
1330 			else
1331 			{
1332 				i      = wl[i];
1333 			}
1334 		}
1335 
1336 		/* w_j = w_j - l_ji*u_ik */
1337 		i = zl[k];
1338 		while( i>=0 )
1339 		{
1340 			ii  = zc[i];
1341 			kk  = wc[i];
1342 			val = U->value[i][ii];
1343 			for(j=kk;j<L->nnz[i];j++)
1344 			{
1345 				jj     = L->index[i][j];
1346 				if( wc[jj]==1 )
1347 				{
1348 					switch(bnr)
1349 					{
1350 					case 1:
1351 						w[jj]   -= val*L->value[i][j];
1352 						break;
1353 					case 2:
1354 						w[4*jj+0] -= U->value[i][4*ii+0]*L->value[i][4*j+0];
1355 						w[4*jj+0] -= U->value[i][4*ii+2]*L->value[i][4*j+1];
1356 						w[4*jj+1] -= U->value[i][4*ii+1]*L->value[i][4*j+0];
1357 						w[4*jj+1] -= U->value[i][4*ii+3]*L->value[i][4*j+1];
1358 						w[4*jj+2] -= U->value[i][4*ii+0]*L->value[i][4*j+2];
1359 						w[4*jj+2] -= U->value[i][4*ii+2]*L->value[i][4*j+3];
1360 						w[4*jj+3] -= U->value[i][4*ii+1]*L->value[i][4*j+2];
1361 						w[4*jj+3] -= U->value[i][4*ii+3]*L->value[i][4*j+3];
1362 
1363 /*						w[4*jj+0] -= L->value[i][4*j+0]*U->value[i][4*ii+0];
1364 						w[4*jj+0] -= L->value[i][4*j+2]*U->value[i][4*ii+1];
1365 						w[4*jj+1] -= L->value[i][4*j+1]*U->value[i][4*ii+0];
1366 						w[4*jj+1] -= L->value[i][4*j+3]*U->value[i][4*ii+1];
1367 						w[4*jj+2] -= L->value[i][4*j+0]*U->value[i][4*ii+2];
1368 						w[4*jj+2] -= L->value[i][4*j+2]*U->value[i][4*ii+3];
1369 						w[4*jj+3] -= L->value[i][4*j+1]*U->value[i][4*ii+2];
1370 						w[4*jj+3] -= L->value[i][4*j+3]*U->value[i][4*ii+3];*/
1371 						break;
1372 					case 3:
1373 						w[9*jj+0] -= L->value[i][9*j+0]*U->value[i][9*ii+0] + L->value[i][9*j+3]*U->value[i][9*ii+1] + L->value[i][9*j+6]*U->value[i][9*ii+2];
1374 						w[9*jj+1] -= L->value[i][9*j+1]*U->value[i][9*ii+0] + L->value[i][9*j+4]*U->value[i][9*ii+1] + L->value[i][9*j+7]*U->value[i][9*ii+2];
1375 						w[9*jj+2] -= L->value[i][9*j+2]*U->value[i][9*ii+0] + L->value[i][9*j+5]*U->value[i][9*ii+1] + L->value[i][9*j+8]*U->value[i][9*ii+2];
1376 						w[9*jj+3] -= L->value[i][9*j+0]*U->value[i][9*ii+3] + L->value[i][9*j+3]*U->value[i][9*ii+4] + L->value[i][9*j+6]*U->value[i][9*ii+5];
1377 						w[9*jj+4] -= L->value[i][9*j+1]*U->value[i][9*ii+3] + L->value[i][9*j+4]*U->value[i][9*ii+4] + L->value[i][9*j+7]*U->value[i][9*ii+5];
1378 						w[9*jj+5] -= L->value[i][9*j+2]*U->value[i][9*ii+3] + L->value[i][9*j+5]*U->value[i][9*ii+4] + L->value[i][9*j+8]*U->value[i][9*ii+5];
1379 						w[9*jj+6] -= L->value[i][9*j+0]*U->value[i][9*ii+6] + L->value[i][9*j+3]*U->value[i][9*ii+7] + L->value[i][9*j+6]*U->value[i][9*ii+8];
1380 						w[9*jj+7] -= L->value[i][9*j+1]*U->value[i][9*ii+6] + L->value[i][9*j+4]*U->value[i][9*ii+7] + L->value[i][9*j+7]*U->value[i][9*ii+8];
1381 						w[9*jj+8] -= L->value[i][9*j+2]*U->value[i][9*ii+6] + L->value[i][9*j+5]*U->value[i][9*ii+7] + L->value[i][9*j+8]*U->value[i][9*ii+8];
1382 
1383 /*						w[9*jj+0] -= U->value[i][9*ii+0]*L->value[i][9*j+0] + U->value[i][9*ii+3]*L->value[i][9*j+1] + U->value[i][9*ii+6]*L->value[i][9*j+2];
1384 						w[9*jj+1] -= U->value[i][9*ii+1]*L->value[i][9*j+0] + U->value[i][9*ii+4]*L->value[i][9*j+1] + U->value[i][9*ii+7]*L->value[i][9*j+2];
1385 						w[9*jj+2] -= U->value[i][9*ii+2]*L->value[i][9*j+0] + U->value[i][9*ii+5]*L->value[i][9*j+1] + U->value[i][9*ii+8]*L->value[i][9*j+2];
1386 						w[9*jj+3] -= U->value[i][9*ii+0]*L->value[i][9*j+3] + U->value[i][9*ii+3]*L->value[i][9*j+4] + U->value[i][9*ii+6]*L->value[i][9*j+5];
1387 						w[9*jj+4] -= U->value[i][9*ii+1]*L->value[i][9*j+3] + U->value[i][9*ii+4]*L->value[i][9*j+4] + U->value[i][9*ii+7]*L->value[i][9*j+5];
1388 						w[9*jj+5] -= U->value[i][9*ii+2]*L->value[i][9*j+3] + U->value[i][9*ii+5]*L->value[i][9*j+4] + U->value[i][9*ii+8]*L->value[i][9*j+5];
1389 						w[9*jj+6] -= U->value[i][9*ii+0]*L->value[i][9*j+6] + U->value[i][9*ii+3]*L->value[i][9*j+7] + U->value[i][9*ii+6]*L->value[i][9*j+8];
1390 						w[9*jj+7] -= U->value[i][9*ii+1]*L->value[i][9*j+6] + U->value[i][9*ii+4]*L->value[i][9*j+7] + U->value[i][9*ii+7]*L->value[i][9*j+8];
1391 						w[9*jj+8] -= U->value[i][9*ii+2]*L->value[i][9*j+6] + U->value[i][9*ii+5]*L->value[i][9*j+7] + U->value[i][9*ii+8]*L->value[i][9*j+8];*/
1392 						break;
1393 					}
1394 				}
1395 				else
1396 				{
1397 					switch(bnr)
1398 					{
1399 					case 1:
1400 						w[jj]   = -val*L->value[i][j];
1401 						break;
1402 					case 2:
1403 						w[4*jj+0] = -U->value[i][4*ii+0]*L->value[i][4*j+0] - U->value[i][4*ii+2]*L->value[i][4*j+1];
1404 						w[4*jj+1] = -U->value[i][4*ii+1]*L->value[i][4*j+0] - U->value[i][4*ii+3]*L->value[i][4*j+1];
1405 						w[4*jj+2] = -U->value[i][4*ii+0]*L->value[i][4*j+2] - U->value[i][4*ii+2]*L->value[i][4*j+3];
1406 						w[4*jj+3] = -U->value[i][4*ii+1]*L->value[i][4*j+2] - U->value[i][4*ii+3]*L->value[i][4*j+3];
1407 
1408 /*						w[4*jj+0] = -L->value[i][4*j+0]*U->value[i][4*ii+0] - L->value[i][4*j+2]*U->value[i][4*ii+1];
1409 						w[4*jj+1] = -L->value[i][4*j+1]*U->value[i][4*ii+0] - L->value[i][4*j+3]*U->value[i][4*ii+1];
1410 						w[4*jj+2] = -L->value[i][4*j+0]*U->value[i][4*ii+2] - L->value[i][4*j+2]*U->value[i][4*ii+3];
1411 						w[4*jj+3] = -L->value[i][4*j+1]*U->value[i][4*ii+2] - L->value[i][4*j+3]*U->value[i][4*ii+3];*/
1412 						break;
1413 					case 3:
1414 						w[9*jj+0] = -L->value[i][9*j+0]*U->value[i][9*ii+0] - L->value[i][9*j+3]*U->value[i][9*ii+1] - L->value[i][9*j+6]*U->value[i][9*ii+2];
1415 						w[9*jj+1] = -L->value[i][9*j+1]*U->value[i][9*ii+0] - L->value[i][9*j+4]*U->value[i][9*ii+1] - L->value[i][9*j+7]*U->value[i][9*ii+2];
1416 						w[9*jj+2] = -L->value[i][9*j+2]*U->value[i][9*ii+0] - L->value[i][9*j+5]*U->value[i][9*ii+1] - L->value[i][9*j+8]*U->value[i][9*ii+2];
1417 						w[9*jj+3] = -L->value[i][9*j+0]*U->value[i][9*ii+3] - L->value[i][9*j+3]*U->value[i][9*ii+4] - L->value[i][9*j+6]*U->value[i][9*ii+5];
1418 						w[9*jj+4] = -L->value[i][9*j+1]*U->value[i][9*ii+3] - L->value[i][9*j+4]*U->value[i][9*ii+4] - L->value[i][9*j+7]*U->value[i][9*ii+5];
1419 						w[9*jj+5] = -L->value[i][9*j+2]*U->value[i][9*ii+3] - L->value[i][9*j+5]*U->value[i][9*ii+4] - L->value[i][9*j+8]*U->value[i][9*ii+5];
1420 						w[9*jj+6] = -L->value[i][9*j+0]*U->value[i][9*ii+6] - L->value[i][9*j+3]*U->value[i][9*ii+7] - L->value[i][9*j+6]*U->value[i][9*ii+8];
1421 						w[9*jj+7] = -L->value[i][9*j+1]*U->value[i][9*ii+6] - L->value[i][9*j+4]*U->value[i][9*ii+7] - L->value[i][9*j+7]*U->value[i][9*ii+8];
1422 						w[9*jj+8] = -L->value[i][9*j+2]*U->value[i][9*ii+6] - L->value[i][9*j+5]*U->value[i][9*ii+7] - L->value[i][9*j+8]*U->value[i][9*ii+8];
1423 
1424 /*						w[9*jj+0] = -U->value[i][9*ii+0]*L->value[i][9*j+0] - U->value[i][9*ii+3]*L->value[i][9*j+1] - U->value[i][9*ii+6]*L->value[i][9*j+2];
1425 						w[9*jj+1] = -U->value[i][9*ii+1]*L->value[i][9*j+0] - U->value[i][9*ii+4]*L->value[i][9*j+1] - U->value[i][9*ii+7]*L->value[i][9*j+2];
1426 						w[9*jj+2] = -U->value[i][9*ii+2]*L->value[i][9*j+0] - U->value[i][9*ii+5]*L->value[i][9*j+1] - U->value[i][9*ii+8]*L->value[i][9*j+2];
1427 						w[9*jj+3] = -U->value[i][9*ii+0]*L->value[i][9*j+3] - U->value[i][9*ii+3]*L->value[i][9*j+4] - U->value[i][9*ii+6]*L->value[i][9*j+5];
1428 						w[9*jj+4] = -U->value[i][9*ii+1]*L->value[i][9*j+3] - U->value[i][9*ii+4]*L->value[i][9*j+4] - U->value[i][9*ii+7]*L->value[i][9*j+5];
1429 						w[9*jj+5] = -U->value[i][9*ii+2]*L->value[i][9*j+3] - U->value[i][9*ii+5]*L->value[i][9*j+4] - U->value[i][9*ii+8]*L->value[i][9*j+5];
1430 						w[9*jj+6] = -U->value[i][9*ii+0]*L->value[i][9*j+6] - U->value[i][9*ii+3]*L->value[i][9*j+7] - U->value[i][9*ii+6]*L->value[i][9*j+8];
1431 						w[9*jj+7] = -U->value[i][9*ii+1]*L->value[i][9*j+6] - U->value[i][9*ii+4]*L->value[i][9*j+7] - U->value[i][9*ii+7]*L->value[i][9*j+8];
1432 						w[9*jj+8] = -U->value[i][9*ii+2]*L->value[i][9*j+6] - U->value[i][9*ii+5]*L->value[i][9*j+7] - U->value[i][9*ii+8]*L->value[i][9*j+8];*/
1433 						break;
1434 					}
1435 					iw[cw++] = jj;
1436 					wc[jj]   = 1;
1437 				}
1438 			}
1439 			ii++;
1440 			zc[i] = ii;
1441 			if( ii<U->nnz[i] )
1442 			{
1443 				kk     = U->index[i][ii];
1444 				ll     = i;
1445 				i      = zl[ll];
1446 				zl[ll] = zl[kk];
1447 				zl[kk] = ll;
1448 			}
1449 			else
1450 			{
1451 				i      = zl[i];
1452 			}
1453 		}
1454 
1455 		switch(bnr)
1456 		{
1457 		case 1:
1458 			toldd       = fabs(D->value[k])*tol;
1459 			break;
1460 		case 2:
1461 			toldd       = fabs(D->value[4*k+0]) + fabs(D->value[4*k+1]);
1462 			toldd      += fabs(D->value[4*k+2]) + fabs(D->value[4*k+3]);
1463 /*			toldd       = sqrt(toldd)*tol;*/
1464 			toldd       = toldd*tol;
1465 			break;
1466 		case 3:
1467 			toldd       = fabs(D->value[9*k+0]) + fabs(D->value[9*k+1]) + fabs(D->value[9*k+2]);
1468 			toldd      += fabs(D->value[9*k+3]) + fabs(D->value[9*k+4]) + fabs(D->value[9*k+5]);
1469 			toldd      += fabs(D->value[9*k+6]) + fabs(D->value[9*k+7]) + fabs(D->value[9*k+8]);
1470 /*			toldd       = sqrt(toldd)*tol;*/
1471 			toldd       = toldd*tol;
1472 			break;
1473 		}
1474 /*		D->value[k] = 1.0/D->value[k];*/
1475 /*		t           = D->value[k];*/
1476 
1477 #if 1
1478 		memcpy(lu,&D->value[bs*k],bs*sizeof(LIS_SCALAR));
1479 		for(kk=0;kk<bnr;kk++)
1480 		{
1481 			lu[kk*bnr+kk] = 1.0 / lu[kk*bnr+kk];
1482 			for(ii=kk+1;ii<bnr;ii++)
1483 			{
1484 				a = lu[kk*bnr+ii] * lu[kk*bnr+kk];
1485 				for(jj=kk+1;jj<bnr;jj++)
1486 				{
1487 					lu[jj*bnr+ii] -= a * lu[jj*bnr+kk];
1488 				}
1489 				lu[kk*bnr+ii] = a;
1490 			}
1491 		}
1492 		memcpy(&D->value[bs*k],lu,bs*sizeof(LIS_SCALAR));
1493 		switch(bnr)
1494 		{
1495 		case 1:
1496 		  lis_printf(comm,"k=%d toldd=%e\n",k,toldd);
1497 		  lis_printf(comm,"k=%d %e\n",k,D->value[k]);
1498 		  break;
1499 		case 2:
1500 		  lis_printf(comm,"k=%d toldd=%e\n",k,toldd);
1501 		  lis_printf(comm,"k=%d %e %e\n",k,D->value[4*k+0],D->value[4*k+2]);
1502 		  lis_printf(comm,"k=%d %e %e\n",k,D->value[4*k+1],D->value[4*k+3]);
1503 		  break;
1504 		case 3:
1505 		  lis_printf(comm,"k=%d toldd=%e\n",k,toldd);
1506 		  lis_printf(comm,"k=%d %e %e %e\n",k,D->value[9*k+0],D->value[9*k+3],D->value[9*k+6]);
1507 		  lis_printf(comm,"k=%d %e %e %e\n",k,D->value[9*k+1],D->value[9*k+4],D->value[9*k+7]);
1508 		  lis_printf(comm,"k=%d %e %e %e\n",k,D->value[9*k+2],D->value[9*k+5],D->value[9*k+8]);
1509 		  break;
1510 		}
1511 #endif
1512 #if 0
1513 		switch(bnr)
1514 		{
1515 		case 1:
1516 			D->value[k] = 1.0/D->value[k];
1517 			break;
1518 		case 2:
1519 			/*
1520 				D[i] = |d0 d2|
1521 				       |d1 d3|
1522 				*/
1523 			t        = 1.0/(D->value[4*k]*D->value[4*k+3] - D->value[4*k+1]*D->value[4*k+2]);
1524 
1525 			a        = D->value[4*k];
1526 			lis_printf(comm,"k=%d t=%e\n",k,t);
1527 			lis_printf(comm,"k=%d %e %e\n",k,D->value[4*k+0],D->value[4*k+2]);
1528 			lis_printf(comm,"k=%d %e %e\n",k,D->value[4*k+1],D->value[4*k+3]);
1529 			D->value[4*k]   = t*D->value[4*k+3];
1530 			D->value[4*k+1] = -t*D->value[4*k+1];
1531 			D->value[4*k+2] = -t*D->value[4*k+2];
1532 			D->value[4*k+3] = t*a;
1533 			break;
1534 		case 3:
1535 			/*
1536 				D[i] = |d0 d3 d6|
1537 				    |d1 d4 d7|
1538 					|d2 d5 d8|
1539 				*/
1540 			t       = D->value[9*k]*D->value[9*k+4]*D->value[9*k+8] + D->value[9*k+1]*D->value[9*k+5]*D->value[9*k+6] + D->value[9*k+2]*D->value[9*k+3]*D->value[9*k+7];
1541 			t      -= D->value[9*k]*D->value[9*k+5]*D->value[9*k+7] + D->value[9*k+1]*D->value[9*k+3]*D->value[9*k+8] + D->value[9*k+2]*D->value[9*k+4]*D->value[9*k+6];
1542 			lis_printf(comm,"k=%d t=%e\n",k,t);
1543 			t       = 1.0 / t;
1544 			b[0]    = t*(D->value[9*k+4]*D->value[9*k+8] - D->value[9*k+5]*D->value[9*k+7]);
1545 			b[1]    = t*(D->value[9*k+5]*D->value[9*k+6] - D->value[9*k+3]*D->value[9*k+8]);
1546 			b[2]    = t*(D->value[9*k+3]*D->value[9*k+7] - D->value[9*k+4]*D->value[9*k+6]);
1547 			b[3]    = t*(D->value[9*k+2]*D->value[9*k+7] - D->value[9*k+1]*D->value[9*k+8]);
1548 			b[4]    = t*(D->value[9*k+0]*D->value[9*k+8] - D->value[9*k+2]*D->value[9*k+6]);
1549 			b[5]    = t*(D->value[9*k+1]*D->value[9*k+6] - D->value[9*k+0]*D->value[9*k+7]);
1550 			b[6]    = t*(D->value[9*k+1]*D->value[9*k+5] - D->value[9*k+2]*D->value[9*k+4]);
1551 			b[7]    = t*(D->value[9*k+2]*D->value[9*k+3] - D->value[9*k+0]*D->value[9*k+5]);
1552 			b[8]    = t*(D->value[9*k+0]*D->value[9*k+4] - D->value[9*k+1]*D->value[9*k+3]);
1553 			memcpy(&D->value[9*k],b,9*sizeof(LIS_SCALAR));
1554 /*			lis_printf(comm,"k=%d t=%e\n",k,t);
1555 			lis_printf(comm,"k=%d %e %e %e\n",k,b[0],b[3],b[6]);
1556 			lis_printf(comm,"k=%d %e %e %e\n",k,b[1],b[4],b[7]);
1557 			lis_printf(comm,"k=%d %e %e %e\n",k,b[2],b[5],b[8]);*/
1558 			break;
1559 		}
1560 		memcpy(tt,&D->value[bs*k],bs*sizeof(LIS_SCALAR));
1561 #endif
1562 #if 1
1563 		if( cz<cw )
1564 		{
1565 			for(j=0;j<cz;j++)
1566 			{
1567 				jj = iz[j];
1568 				if( wc[jj] )
1569 				{
1570 #if 1
1571 					for(i=0;i<bnr;i++)
1572 					{
1573 						wz[i] = -w[bs*jj+i]*lu[0];
1574 						for(ii=1;ii<bnr;ii++)
1575 						{
1576 							a = -w[bs*jj+ii*bnr+i];
1577 							for(kk=0;kk<ii-1;kk++)
1578 							{
1579 								a -= wz[kk*bnr+i] * lu[ii*bnr+kk];
1580 							}
1581 							wz[ii*bnr+i] = a * lu[ii*bnr+ii];
1582 						}
1583 					}
1584 					for(i=0;i<bnr;i++)
1585 					{
1586 						for(ii=bnr-1;ii>=0;ii--)
1587 						{
1588 							a = wz[ii*bnr+i];
1589 							for(kk=ii+1;kk<bnr;kk++)
1590 							{
1591 								a -= wz[kk*bnr+i] * lu[ii*bnr+kk];
1592 							}
1593 							wz[ii*bnr+i] = a;
1594 						}
1595 					}
1596 #endif
1597 					switch(bnr)
1598 					{
1599 					case 1:
1600 /*						D->value[jj] -= z[jj]*w[jj];*/
1601 /*						D->value[jj] -= z[jj]*w[jj]*tt[0];*/
1602 						D->value[jj] -= z[jj]*wz[0];
1603 						break;
1604 					case 2:
1605 /*						wz[0] = w[4*jj+0]*tt[0] + w[4*jj+2]*tt[1];
1606 						wz[1] = w[4*jj+1]*tt[0] + w[4*jj+3]*tt[1];
1607 						wz[2] = w[4*jj+0]*tt[2] + w[4*jj+2]*tt[3];
1608 						wz[3] = w[4*jj+1]*tt[2] + w[4*jj+3]*tt[3];*/
1609 						D->value[4*jj+0] -= z[4*jj+0]*wz[0] + z[4*jj+2]*wz[1];
1610 						D->value[4*jj+1] -= z[4*jj+1]*wz[0] + z[4*jj+3]*wz[1];
1611 						D->value[4*jj+2] -= z[4*jj+0]*wz[2] + z[4*jj+2]*wz[3];
1612 						D->value[4*jj+3] -= z[4*jj+1]*wz[2] + z[4*jj+3]*wz[3];
1613 
1614 /*						D->value[4*jj+0] -= w[4*jj+0]*z[4*jj+0] + w[4*jj+2]*z[4*jj+1];
1615 						D->value[4*jj+1] -= w[4*jj+1]*z[4*jj+0] + w[4*jj+3]*z[4*jj+1];
1616 						D->value[4*jj+2] -= w[4*jj+0]*z[4*jj+2] + w[4*jj+2]*z[4*jj+3];
1617 						D->value[4*jj+3] -= w[4*jj+1]*z[4*jj+2] + w[4*jj+3]*z[4*jj+3];*/
1618 						break;
1619 					case 3:
1620 /*						wz[0] = tt[0]*z[9*jj+0] + tt[3]*z[9*jj+1] + tt[6]*z[9*jj+2];
1621 						wz[1] = tt[1]*z[9*jj+0] + tt[4]*z[9*jj+1] + tt[7]*z[9*jj+2];
1622 						wz[2] = tt[2]*z[9*jj+0] + tt[5]*z[9*jj+1] + tt[8]*z[9*jj+2];
1623 						wz[3] = tt[0]*z[9*jj+3] + tt[3]*z[9*jj+4] + tt[6]*z[9*jj+5];
1624 						wz[4] = tt[1]*z[9*jj+3] + tt[4]*z[9*jj+4] + tt[7]*z[9*jj+5];
1625 						wz[5] = tt[2]*z[9*jj+3] + tt[5]*z[9*jj+4] + tt[8]*z[9*jj+5];
1626 						wz[6] = tt[0]*z[9*jj+6] + tt[3]*z[9*jj+7] + tt[6]*z[9*jj+8];
1627 						wz[7] = tt[1]*z[9*jj+6] + tt[4]*z[9*jj+7] + tt[7]*z[9*jj+8];
1628 						wz[8] = tt[2]*z[9*jj+6] + tt[5]*z[9*jj+7] + tt[8]*z[9*jj+8];
1629 */
1630 /*						wz[0] = z[9*jj+0];
1631 						wz[1] = z[9*jj+1];
1632 						wz[2] = z[9*jj+2];
1633 						wz[3] = z[9*jj+3];
1634 						wz[4] = z[9*jj+4];
1635 						wz[5] = z[9*jj+5];
1636 						wz[6] = z[9*jj+6];
1637 						wz[7] = z[9*jj+7];
1638 						wz[8] = z[9*jj+8];*/
1639 
1640 						D->value[9*jj+0] -= w[9*jj+0]*wz[0] + w[9*jj+3]*wz[1] + w[9*jj+6]*wz[2];
1641 						D->value[9*jj+1] -= w[9*jj+1]*wz[0] + w[9*jj+4]*wz[1] + w[9*jj+7]*wz[2];
1642 						D->value[9*jj+2] -= w[9*jj+2]*wz[0] + w[9*jj+5]*wz[1] + w[9*jj+8]*wz[2];
1643 						D->value[9*jj+3] -= w[9*jj+0]*wz[3] + w[9*jj+3]*wz[4] + w[9*jj+6]*wz[5];
1644 						D->value[9*jj+4] -= w[9*jj+1]*wz[3] + w[9*jj+4]*wz[4] + w[9*jj+7]*wz[5];
1645 						D->value[9*jj+5] -= w[9*jj+2]*wz[3] + w[9*jj+5]*wz[4] + w[9*jj+8]*wz[5];
1646 						D->value[9*jj+6] -= w[9*jj+0]*wz[6] + w[9*jj+3]*wz[7] + w[9*jj+6]*wz[8];
1647 						D->value[9*jj+7] -= w[9*jj+1]*wz[6] + w[9*jj+4]*wz[7] + w[9*jj+7]*wz[8];
1648 						D->value[9*jj+8] -= w[9*jj+2]*wz[6] + w[9*jj+5]*wz[7] + w[9*jj+8]*wz[8];
1649 						break;
1650 					}
1651 				}
1652 			}
1653 		}
1654 		else
1655 		{
1656 			for(j=0;j<cw;j++)
1657 			{
1658 				jj = iw[j];
1659 				if( zc[jj] )
1660 				{
1661 #if 1
1662 					for(i=0;i<bnr;i++)
1663 					{
1664 						wz[i] = -w[bs*jj+i]*lu[0];
1665 						for(ii=1;ii<bnr;ii++)
1666 						{
1667 							a = -w[bs*jj+ii*bnr+i];
1668 							for(kk=0;kk<ii-1;kk++)
1669 							{
1670 								a -= wz[kk*bnr+i] * lu[ii*bnr+kk];
1671 							}
1672 							wz[ii*bnr+i] = a * lu[ii*bnr+ii];
1673 						}
1674 					}
1675 					for(i=0;i<bnr;i++)
1676 					{
1677 						for(ii=bnr-1;ii>=0;ii--)
1678 						{
1679 							a = wz[ii*bnr+i];
1680 							for(kk=ii+1;kk<bnr;kk++)
1681 							{
1682 								a -= wz[kk*bnr+i] * lu[ii*bnr+kk];
1683 							}
1684 							wz[ii*bnr+i] = a;
1685 						}
1686 					}
1687 #endif
1688 					switch(bnr)
1689 					{
1690 					case 1:
1691 /*						D->value[jj] -= z[jj]*w[jj];
1692 						D->value[jj] -= z[jj]*w[jj]*tt[0];*/
1693 						D->value[jj] -= z[jj]*wz[0];
1694 						break;
1695 					case 2:
1696 /*						wz[0] = w[4*jj+0]*tt[0] + w[4*jj+2]*tt[1];
1697 						wz[1] = w[4*jj+1]*tt[0] + w[4*jj+3]*tt[1];
1698 						wz[2] = w[4*jj+0]*tt[2] + w[4*jj+2]*tt[3];
1699 						wz[3] = w[4*jj+1]*tt[2] + w[4*jj+3]*tt[3];
1700 						*/
1701 						D->value[4*jj+0] -= z[4*jj+0]*wz[0] + z[4*jj+2]*wz[1];
1702 						D->value[4*jj+1] -= z[4*jj+1]*wz[0] + z[4*jj+3]*wz[1];
1703 						D->value[4*jj+2] -= z[4*jj+0]*wz[2] + z[4*jj+2]*wz[3];
1704 						D->value[4*jj+3] -= z[4*jj+1]*wz[2] + z[4*jj+3]*wz[3];
1705 
1706 /*						D->value[4*jj+0] -= w[4*jj+0]*z[4*jj+0] + w[4*jj+2]*z[4*jj+1];
1707 						D->value[4*jj+1] -= w[4*jj+1]*z[4*jj+0] + w[4*jj+3]*z[4*jj+1];
1708 						D->value[4*jj+2] -= w[4*jj+0]*z[4*jj+2] + w[4*jj+2]*z[4*jj+3];
1709 						D->value[4*jj+3] -= w[4*jj+1]*z[4*jj+2] + w[4*jj+3]*z[4*jj+3];*/
1710 						break;
1711 					case 3:
1712 /*						wz[0] = tt[0]*z[9*jj+0] + tt[3]*z[9*jj+1] + tt[6]*z[9*jj+2];
1713 						wz[1] = tt[1]*z[9*jj+0] + tt[4]*z[9*jj+1] + tt[7]*z[9*jj+2];
1714 						wz[2] = tt[2]*z[9*jj+0] + tt[5]*z[9*jj+1] + tt[8]*z[9*jj+2];
1715 						wz[3] = tt[0]*z[9*jj+3] + tt[3]*z[9*jj+4] + tt[6]*z[9*jj+5];
1716 						wz[4] = tt[1]*z[9*jj+3] + tt[4]*z[9*jj+4] + tt[7]*z[9*jj+5];
1717 						wz[5] = tt[2]*z[9*jj+3] + tt[5]*z[9*jj+4] + tt[8]*z[9*jj+5];
1718 						wz[6] = tt[0]*z[9*jj+6] + tt[3]*z[9*jj+7] + tt[6]*z[9*jj+8];
1719 						wz[7] = tt[1]*z[9*jj+6] + tt[4]*z[9*jj+7] + tt[7]*z[9*jj+8];
1720 						wz[8] = tt[2]*z[9*jj+6] + tt[5]*z[9*jj+7] + tt[8]*z[9*jj+8];
1721 */
1722 /*						wz[0] = z[9*jj+0];
1723 						wz[1] = z[9*jj+1];
1724 						wz[2] = z[9*jj+2];
1725 						wz[3] = z[9*jj+3];
1726 						wz[4] = z[9*jj+4];
1727 						wz[5] = z[9*jj+5];
1728 						wz[6] = z[9*jj+6];
1729 						wz[7] = z[9*jj+7];
1730 						wz[8] = z[9*jj+8];*/
1731 
1732 						D->value[9*jj+0] -= w[9*jj+0]*wz[0] + w[9*jj+3]*wz[1] + w[9*jj+6]*wz[2];
1733 						D->value[9*jj+1] -= w[9*jj+1]*wz[0] + w[9*jj+4]*wz[1] + w[9*jj+7]*wz[2];
1734 						D->value[9*jj+2] -= w[9*jj+2]*wz[0] + w[9*jj+5]*wz[1] + w[9*jj+8]*wz[2];
1735 						D->value[9*jj+3] -= w[9*jj+0]*wz[3] + w[9*jj+3]*wz[4] + w[9*jj+6]*wz[5];
1736 						D->value[9*jj+4] -= w[9*jj+1]*wz[3] + w[9*jj+4]*wz[4] + w[9*jj+7]*wz[5];
1737 						D->value[9*jj+5] -= w[9*jj+2]*wz[3] + w[9*jj+5]*wz[4] + w[9*jj+8]*wz[5];
1738 						D->value[9*jj+6] -= w[9*jj+0]*wz[6] + w[9*jj+3]*wz[7] + w[9*jj+6]*wz[8];
1739 						D->value[9*jj+7] -= w[9*jj+1]*wz[6] + w[9*jj+4]*wz[7] + w[9*jj+7]*wz[8];
1740 						D->value[9*jj+8] -= w[9*jj+2]*wz[6] + w[9*jj+5]*wz[7] + w[9*jj+8]*wz[8];
1741 						break;
1742 					}
1743 				}
1744 			}
1745 		}
1746 #endif
1747 		/* drop U */
1748 		nnz = 0;
1749 		for(j=0;j<cz;j++)
1750 		{
1751 			jj = iz[j];
1752 			switch(bnr)
1753 			{
1754 			case 1:
1755 				t       = fabs(z[jj]);
1756 				break;
1757 			case 2:
1758 				t       = fabs(z[4*jj+0]) + fabs(z[4*jj+1]);
1759 				t      += fabs(z[4*jj+2]) + fabs(z[4*jj+3]);
1760 /*				t       = sqrt(t);*/
1761 				break;
1762 			case 3:
1763 				t       = fabs(z[9*jj+0]) + fabs(z[9*jj+1]) + fabs(z[9*jj+2]);
1764 				t      += fabs(z[9*jj+3]) + fabs(z[9*jj+4]) + fabs(z[9*jj+5]);
1765 				t      += fabs(z[9*jj+6]) + fabs(z[9*jj+7]) + fabs(z[9*jj+8]);
1766 /*				t       = sqrt(t);*/
1767 				break;
1768 			}
1769 			if( t>toldd )
1770 			{
1771 				iz[nnz++]    = jj;
1772 				tmp[jj] = t;
1773 			}
1774 			else
1775 			{
1776 				switch(bnr)
1777 				{
1778 				case 1:
1779 					z[k] = z[k] + fabs(z[jj]);
1780 					break;
1781 				case 2:
1782 					z[4*k+0] += fabs(z[4*jj+0]);
1783 					z[4*k+1] += fabs(z[4*jj+1]);
1784 					z[4*k+1] += fabs(z[4*jj+2]);
1785 					z[4*k+1] += fabs(z[4*jj+3]);
1786 					break;
1787 				case 3:
1788 					z[9*k+0] += fabs(z[9*jj+0]);
1789 					z[9*k+1] += fabs(z[9*jj+1]);
1790 					z[9*k+2] += fabs(z[9*jj+2]);
1791 					z[9*k+3] += fabs(z[9*jj+3]);
1792 					z[9*k+4] += fabs(z[9*jj+4]);
1793 					z[9*k+5] += fabs(z[9*jj+5]);
1794 					z[9*k+6] += fabs(z[9*jj+6]);
1795 					z[9*k+7] += fabs(z[9*jj+7]);
1796 					z[9*k+8] += fabs(z[9*jj+8]);
1797 					break;
1798 				}
1799 				zc[jj] = 0;
1800 				tmp[jj] = 0.0;
1801 			}
1802 		}
1803 		len = _min(lfil,nnz);
1804 		lis_sort_di(0,nnz-1,tmp,iz);
1805 		lis_sort_i(0,len-1,iz);
1806 		U->nnz[k] = len;
1807 		if( len>0 )
1808 		{
1809 			U->index[k] = (LIS_INT *)malloc(len*sizeof(LIS_INT));
1810 			U->value[k] = (LIS_SCALAR *)malloc(bs*len*sizeof(LIS_SCALAR));
1811 		}
1812 		for(j=0;j<len;j++)
1813 		{
1814 			jj = iz[j];
1815 			U->index[k][j] = jj;
1816 /*			U->value[k][j] = z[jj];*/
1817 			memcpy(&U->value[k][bs*j],&z[bs*jj],bs*sizeof(LIS_SCALAR));
1818 /*			for(i=0;i<bnr;i++)
1819 			{
1820 				lis_printf(comm,"U:k=%d j=%d",k,jj);
1821 				for(ii=0;ii<bnr;ii++)
1822 				{
1823 					lis_printf(comm," %e",z[bs*jj+ii*bnr+i]);
1824 				}
1825 				lis_printf(comm,"\n");
1826 			}*/
1827 		}
1828 		for(j=len;j<cz;j++) zc[iz[j]] = 0;
1829 		cz = nnz;
1830 /*		lis_printf(comm,"k=%d U:lfil=%d nnz=%d len=%d\n",k,lfil,nnz,len);*/
1831 
1832 		/* drop L */
1833 		nnz = 0;
1834 		for(j=0;j<cw;j++)
1835 		{
1836 			jj = iw[j];
1837 			switch(bnr)
1838 			{
1839 			case 1:
1840 				t       = fabs(w[jj]);
1841 				break;
1842 			case 2:
1843 				t       = fabs(w[4*jj+0]) + fabs(w[4*jj+1]);
1844 				t      += fabs(w[4*jj+2]) + fabs(w[4*jj+3]);
1845 /*				t       = sqrt(t);*/
1846 				break;
1847 			case 3:
1848 				t       = fabs(w[9*jj+0]) + fabs(w[9*jj+1]) + fabs(w[9*jj+2]);
1849 				t      += fabs(w[9*jj+3]) + fabs(w[9*jj+4]) + fabs(w[9*jj+5]);
1850 				t      += fabs(w[9*jj+6]) + fabs(w[9*jj+7]) + fabs(w[9*jj+8]);
1851 /*				t       = sqrt(t);*/
1852 				break;
1853 			}
1854 /*			lis_printf(comm,"k=%d L:jj=%d t=%e tol=%e\n",k,jj,t,toldd);*/
1855 			if( t>toldd )
1856 			{
1857 				iw[nnz++]    = jj;
1858 				tmp[jj] = t;
1859 			}
1860 			else
1861 			{
1862 				wc[jj] = 0;
1863 				tmp[jj] = 0.0;
1864 			}
1865 		}
1866 		len = _min(lfil,nnz);
1867 		lis_sort_di(0,nnz-1,tmp,iw);
1868 		lis_sort_i(0,len-1,iw);
1869 		L->nnz[k] = len;
1870 		if( len>0 )
1871 		{
1872 			L->index[k] = (LIS_INT *)malloc(len*sizeof(LIS_INT));
1873 			L->value[k] = (LIS_SCALAR *)malloc(bs*len*sizeof(LIS_SCALAR));
1874 		}
1875 		for(j=0;j<len;j++)
1876 		{
1877 			jj = iw[j];
1878 			L->index[k][j] = jj;
1879 
1880 #if 1
1881 			for(i=0;i<bnr;i++)
1882 			{
1883 				L->value[k][bs*j+i] = -w[bs*jj+i]*lu[0];
1884 				for(ii=1;ii<bnr;ii++)
1885 				{
1886 					a = -w[bs*jj+ii*bnr+i];
1887 					for(kk=0;kk<ii-1;kk++)
1888 					{
1889 						a -= L->value[k][bs*j+kk*bnr+i] * lu[ii*bnr+kk];
1890 					}
1891 					L->value[k][bs*j+ii*bnr+i] = a * lu[ii*bnr+ii];
1892 				}
1893 			}
1894 			for(i=0;i<bnr;i++)
1895 			{
1896 				for(ii=bnr-1;ii>=0;ii--)
1897 				{
1898 					a = L->value[k][bs*j+ii*bnr+i];
1899 					for(kk=ii+1;kk<bnr;kk++)
1900 					{
1901 						a -= L->value[k][bs*j+kk*bnr+i] * lu[ii*bnr+kk];
1902 					}
1903 					L->value[k][bs*j+ii*bnr+i] = a;
1904 				}
1905 			}
1906 #endif
1907 #if 0
1908 			switch(bnr)
1909 			{
1910 			case 1:
1911 				L->value[k][j] = w[jj]*tt[0];
1912 				break;
1913 			case 2:
1914 				L->value[k][4*j+0] = w[4*jj+0]*tt[0] + w[4*jj+2]*tt[1];
1915 				L->value[k][4*j+1] = w[4*jj+1]*tt[0] + w[4*jj+3]*tt[1];
1916 				L->value[k][4*j+2] = w[4*jj+0]*tt[2] + w[4*jj+2]*tt[3];
1917 				L->value[k][4*j+3] = w[4*jj+1]*tt[2] + w[4*jj+3]*tt[3];
1918 				break;
1919 			case 3:
1920 				L->value[k][9*j+0] = w[9*jj+0]*tt[0] + w[9*jj+3]*tt[1] + w[9*jj+6]*tt[2];
1921 				L->value[k][9*j+1] = w[9*jj+1]*tt[0] + w[9*jj+4]*tt[1] + w[9*jj+7]*tt[2];
1922 				L->value[k][9*j+2] = w[9*jj+2]*tt[0] + w[9*jj+5]*tt[1] + w[9*jj+8]*tt[2];
1923 				L->value[k][9*j+3] = w[9*jj+0]*tt[3] + w[9*jj+3]*tt[4] + w[9*jj+6]*tt[5];
1924 				L->value[k][9*j+4] = w[9*jj+1]*tt[3] + w[9*jj+4]*tt[4] + w[9*jj+7]*tt[5];
1925 				L->value[k][9*j+5] = w[9*jj+2]*tt[3] + w[9*jj+5]*tt[4] + w[9*jj+8]*tt[5];
1926 				L->value[k][9*j+6] = w[9*jj+0]*tt[6] + w[9*jj+3]*tt[7] + w[9*jj+6]*tt[8];
1927 				L->value[k][9*j+7] = w[9*jj+1]*tt[6] + w[9*jj+4]*tt[7] + w[9*jj+7]*tt[8];
1928 				L->value[k][9*j+8] = w[9*jj+2]*tt[6] + w[9*jj+5]*tt[7] + w[9*jj+8]*tt[8];
1929 				break;
1930 			}
1931 /*			for(i=0;i<bnr;i++)
1932 			{
1933 				lis_printf(comm,"L:k=%d j=%d",k,jj);
1934 				for(ii=0;ii<bnr;ii++)
1935 				{
1936 					lis_printf(comm," %e",L->value[k][bs*j+ii*bnr+i]);
1937 				}
1938 				lis_printf(comm,"\n");
1939 			}*/
1940 #endif
1941 		}
1942 		for(j=len;j<cw;j++) wc[iw[j]] = 0;
1943 		cw = nnz;
1944 /*		lis_printf(comm,"k=%d L:lfil=%d nnz=%d len=%d\n",k,lfil,nnz,len);*/
1945 
1946 
1947 		/**/
1948 		for(j=0;j<cz;j++) zc[iz[j]] = 0;
1949 		for(j=0;j<cw;j++) wc[iw[j]] = 0;
1950 		if(U->nnz[k]>0)
1951 		{
1952 			jj     = iz[0];
1953 			zc[k]  = 0;
1954 			zl[k]  = zl[jj];
1955 			zl[jj] = k;
1956 		}
1957 		if(L->nnz[k]>0)
1958 		{
1959 			jj     = iw[0];
1960 			wc[k]  = 0;
1961 			wl[k]  = wl[jj];
1962 			wl[jj] = k;
1963 		}
1964 	}
1965 
1966 	precon->L  = L;
1967 	precon->U  = U;
1968 	precon->WD  = D;
1969 
1970 	lis_free2(12,tmp,w,z,iw,wc,wl,iz,zc,zl,ptr,index,value);
1971 
1972 	LIS_DEBUG_FUNC_OUT;
1973 	return LIS_SUCCESS;
1974 }
1975 
1976 
1977 
1978 #undef __FUNC__
1979 #define __FUNC__ "lis_psolve_iluc"
lis_psolve_iluc(LIS_SOLVER solver,LIS_VECTOR B,LIS_VECTOR X)1980 LIS_INT lis_psolve_iluc(LIS_SOLVER solver, LIS_VECTOR B, LIS_VECTOR X)
1981 {
1982 #ifdef _OPENMP
1983 	LIS_INT i,j,jj,n;
1984 	LIS_INT nprocs,my_rank,is,ie;
1985 	LIS_SCALAR w;
1986 	LIS_SCALAR *x;
1987 	LIS_MATRIX_ILU L,U;
1988 	LIS_VECTOR D;
1989 	LIS_PRECON precon;
1990 	LIS_QUAD_DECLAR;
1991 	#ifdef USE_QUAD_PRECISION
1992 		LIS_QUAD w1;
1993 		LIS_SCALAR *xl;
1994 	#endif
1995 
1996 
1997 	LIS_DEBUG_FUNC_IN;
1998 
1999 	precon = solver->precon;
2000 	L = precon->L;
2001 	U = precon->U;
2002 	D = precon->D;
2003 	n = L->n;
2004 	x = X->value;
2005 	#ifdef USE_QUAD_PRECISION
2006 		xl = X->value_lo;
2007 	#endif
2008 
2009 	#ifdef USE_QUAD_PRECISION
2010 		if( B->precision==LIS_PRECISION_DEFAULT )
2011 		{
2012 	#endif
2013 			lis_vector_copy(B,X);
2014 			nprocs = omp_get_max_threads();
2015 			#pragma omp parallel private(i,j,jj,is,ie,my_rank,w)
2016 			{
2017 				my_rank = omp_get_thread_num();
2018 				LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
2019 
2020 				for(i=is; i<ie; i++)
2021 				{
2022 					for(j=0;j<L->nnz[i];j++)
2023 					{
2024 						jj     = L->index[i][j];
2025 						x[jj] -= L->value[i][j] * x[i];
2026 					}
2027 				}
2028 				for(i=ie-1; i>=is; i--)
2029 				{
2030 					w = x[i];
2031 					for(j=0;j<U->nnz[i];j++)
2032 					{
2033 						jj = U->index[i][j];
2034 						w -= U->value[i][j] * x[jj];
2035 					}
2036 					x[i] = w * D->value[i];
2037 				}
2038 			}
2039 	#ifdef USE_QUAD_PRECISION
2040 		}
2041 		else
2042 		{
2043 			lis_vector_copyex_mm(B,X);
2044 			nprocs = omp_get_max_threads();
2045 			#ifndef USE_SSE2
2046 				#pragma omp parallel private(i,j,jj,is,ie,my_rank,w1,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
2047 			#else
2048 				#pragma omp parallel private(i,j,jj,is,ie,my_rank,w1,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
2049 			#endif
2050 			{
2051 				my_rank = omp_get_thread_num();
2052 				LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
2053 				for(i=is; i<ie; i++)
2054 				{
2055 					for(j=0;j<L->nnz[i];j++)
2056 					{
2057 						jj     = L->index[i][j];
2058 /*						if( jj>=is && jj<ie )*/
2059 						{
2060 							#ifndef USE_SSE2
2061 								LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-L->value[i][j]);
2062 							#else
2063 								LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-L->value[i][j]);
2064 							#endif
2065 						}
2066 					}
2067 				}
2068 				for(i=ie-1; i>=is; i--)
2069 				{
2070 					w1.hi = x[i];
2071 					w1.lo = xl[i];
2072 					for(j=0;j<U->nnz[i];j++)
2073 					{
2074 						jj = U->index[i][j];
2075 						#ifndef USE_SSE2
2076 							LIS_QUAD_FMAD(w1.hi,w1.lo,w1.hi,w1.lo,x[jj],xl[jj],-U->value[i][j]);
2077 						#else
2078 							LIS_QUAD_FMAD_SSE2(w1.hi,w1.lo,w1.hi,w1.lo,x[jj],xl[jj],-U->value[i][j]);
2079 						#endif
2080 					}
2081 					#ifndef USE_SSE2
2082 						LIS_QUAD_MULD(x[i],xl[i],w1.hi,w1.lo,D->value[i]);
2083 					#else
2084 						LIS_QUAD_MULD_SSE2(x[i],xl[i],w1.hi,w1.lo,D->value[i]);
2085 					#endif
2086 				}
2087 			}
2088 		}
2089 	#endif
2090 
2091 	LIS_DEBUG_FUNC_OUT;
2092 	return LIS_SUCCESS;
2093 
2094 #else
2095 	LIS_INT i,j,jj,n;
2096 	LIS_SCALAR w;
2097 	LIS_SCALAR *b,*x;
2098 	LIS_MATRIX_ILU L,U;
2099 	LIS_VECTOR D;
2100 	LIS_PRECON precon;
2101 	LIS_QUAD_DECLAR;
2102 	#ifdef USE_QUAD_PRECISION
2103 		LIS_QUAD w1;
2104 		LIS_SCALAR *xl;
2105 	#endif
2106 
2107 	/*
2108 	 *  LUx = b
2109 	 *  LU  = (D + L*A) * (I + D^-1 * U*A)
2110 	 */
2111 
2112 	LIS_DEBUG_FUNC_IN;
2113 
2114 	precon = solver->precon;
2115 	L = precon->L;
2116 	U = precon->U;
2117 	D = precon->D;
2118 	n = L->n;
2119 	b = B->value;
2120 	x = X->value;
2121 	#ifdef USE_QUAD_PRECISION
2122 		xl = X->value_lo;
2123 	#endif
2124 
2125 	#ifdef USE_QUAD_PRECISION
2126 		if( B->precision==LIS_PRECISION_DEFAULT )
2127 		{
2128 	#endif
2129 			lis_vector_copy(B,X);
2130 			for(i=0; i<n; i++)
2131 			{
2132 				for(j=0;j<L->nnz[i];j++)
2133 				{
2134 					jj     = L->index[i][j];
2135 					x[jj] -= L->value[i][j] * x[i];
2136 				}
2137 /*				x[i] = x[i] * L->work[i];*/
2138 			}
2139 			for(i=n-1; i>=0; i--)
2140 			{
2141 				w = x[i];
2142 				for(j=0;j<U->nnz[i];j++)
2143 				{
2144 					jj = U->index[i][j];
2145 					w -= U->value[i][j] * x[jj];
2146 				}
2147 				x[i] = w * D->value[i];
2148 			}
2149 	#ifdef USE_QUAD_PRECISION
2150 		}
2151 		else
2152 		{
2153 			lis_vector_copyex_mm(B,X);
2154 			for(i=0; i<n; i++)
2155 			{
2156 				for(j=0;j<L->nnz[i];j++)
2157 				{
2158 					jj     = L->index[i][j];
2159 					#ifndef USE_SSE2
2160 						LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-L->value[i][j]);
2161 					#else
2162 						LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-L->value[i][j]);
2163 					#endif
2164 					/* x[jj] -= L->value[i][j] * x[i]; */
2165 				}
2166 			}
2167 			for(i=n-1; i>=0; i--)
2168 			{
2169 				w1.hi = x[i];
2170 				w1.lo = xl[i];
2171 				/* t = x[i]; */
2172 				for(j=0;j<U->nnz[i];j++)
2173 				{
2174 					jj = U->index[i][j];
2175 					#ifndef USE_SSE2
2176 						LIS_QUAD_FMAD(w1.hi,w1.lo,w1.hi,w1.lo,x[jj],xl[jj],-U->value[i][j]);
2177 					#else
2178 						LIS_QUAD_FMAD_SSE2(w1.hi,w1.lo,w1.hi,w1.lo,x[jj],xl[jj],-U->value[i][j]);
2179 					#endif
2180 					/* t -= U->value[i][j] * x[jj]; */
2181 				}
2182 				#ifndef USE_SSE2
2183 					LIS_QUAD_MULD(x[i],xl[i],w1.hi,w1.lo,D->value[i]);
2184 				#else
2185 					LIS_QUAD_MULD_SSE2(x[i],xl[i],w1.hi,w1.lo,D->value[i]);
2186 				#endif
2187 				/* x[i] = t * D->value[i]; */
2188 			}
2189 		}
2190 	#endif
2191 
2192 	LIS_DEBUG_FUNC_OUT;
2193 	return LIS_SUCCESS;
2194 
2195 #endif
2196 }
2197 
2198 #undef __FUNC__
2199 #define __FUNC__ "lis_psolve_iluc_bsr"
lis_psolve_iluc_bsr(LIS_SOLVER solver,LIS_VECTOR B,LIS_VECTOR X)2200 LIS_INT lis_psolve_iluc_bsr(LIS_SOLVER solver, LIS_VECTOR B, LIS_VECTOR X)
2201 {
2202 	LIS_INT i,j,jj,nr,bnr,ii,bs;
2203 	LIS_SCALAR w[9],t;
2204 	LIS_SCALAR *x;
2205 	LIS_MATRIX_ILU L,U;
2206 	LIS_MATRIX_DIAG D;
2207 	LIS_PRECON precon;
2208 
2209 	/*
2210 	 *  LUx = b
2211 	 *  LU  = (D + L*A) * (I + D^-1 * U*A)
2212 	 */
2213 
2214 	LIS_DEBUG_FUNC_IN;
2215 
2216 	precon = solver->precon;
2217 	L = precon->L;
2218 	U = precon->U;
2219 	D = precon->WD;
2220 	x = X->value;
2221 	nr = solver->A->nr;
2222 	bnr = solver->A->bnr;
2223 	bs  = bnr*bnr;
2224 
2225 	lis_vector_copy(B,X);
2226 	for(i=0; i<nr; i++)
2227 	{
2228 		for(j=0;j<L->nnz[i];j++)
2229 		{
2230 			jj     = L->index[i][j];
2231 /*			memcpy(w,&x[bnr*i],bnr*sizeof(LIS_SCALAR));*/
2232 			switch(bnr)
2233 			{
2234 			case 1:
2235 				x[jj] -= L->value[i][j] * x[i];
2236 				break;
2237 			case 2:
2238 				x[2*jj+0] -= L->value[i][4*j]   * x[2*i+0];
2239 				x[2*jj+0] -= L->value[i][4*j+2] * x[2*i+1];
2240 				x[2*jj+1] -= L->value[i][4*j+1] * x[2*i+0];
2241 				x[2*jj+1] -= L->value[i][4*j+3] * x[2*i+1];
2242 				break;
2243 			case 3:
2244 				x[3*jj]   -= L->value[i][9*j]   * x[3*i+0] + L->value[i][9*j+3] * x[3*i+1] + L->value[i][9*j+6] * x[3*i+2];
2245 				x[3*jj+1] -= L->value[i][9*j+1] * x[3*i+0] + L->value[i][9*j+4] * x[3*i+1] + L->value[i][9*j+7] * x[3*i+2];
2246 				x[3*jj+2] -= L->value[i][9*j+2] * x[3*i+0] + L->value[i][9*j+5] * x[3*i+1] + L->value[i][9*j+8] * x[3*i+2];
2247 				break;
2248 			}
2249 		}
2250 	}
2251 	for(i=nr-1; i>=0; i--)
2252 	{
2253 /*		w = x[i];*/
2254 		memcpy(w,&x[bnr*i],bnr*sizeof(LIS_SCALAR));
2255 		for(j=0;j<U->nnz[i];j++)
2256 		{
2257 			jj = U->index[i][j];
2258 			switch(bnr)
2259 			{
2260 			case 1:
2261 				w[0] -= U->value[i][j] * x[jj];
2262 				break;
2263 			case 2:
2264 				w[0] -= U->value[i][4*j]   * x[2*jj];
2265 				w[0] -= U->value[i][4*j+2] * x[2*jj+1];
2266 				w[1] -= U->value[i][4*j+1] * x[2*jj];
2267 				w[1] -= U->value[i][4*j+3] * x[2*jj+1];
2268 				break;
2269 			case 3:
2270 				w[0] -= U->value[i][9*j]   * x[3*jj] + U->value[i][9*j+3] * x[3*jj+1] + U->value[i][9*j+6] * x[3*jj+2];
2271 				w[1] -= U->value[i][9*j+1] * x[3*jj] + U->value[i][9*j+4] * x[3*jj+1] + U->value[i][9*j+7] * x[3*jj+2];
2272 				w[2] -= U->value[i][9*j+2] * x[3*jj] + U->value[i][9*j+5] * x[3*jj+1] + U->value[i][9*j+8] * x[3*jj+2];
2273 				break;
2274 			}
2275 		}
2276 #if 0
2277 		switch(bnr)
2278 		{
2279 		case 1:
2280 			x[i] = w[0] * D->value[i];
2281 			break;
2282 		case 2:
2283 			x[2*i+0] = D->value[4*i]   * w[0] + D->value[4*i+2] * w[1];
2284 			x[2*i+1] = D->value[4*i+1] * w[0] + D->value[4*i+3] * w[1];
2285 			break;
2286 		case 3:
2287 			x[3*i+0] = D->value[9*i]   * w[0] + D->value[9*i+3] * w[1] + D->value[9*i+6] * w[2];
2288 			x[3*i+1] = D->value[9*i+1] * w[0] + D->value[9*i+4] * w[1] + D->value[9*i+7] * w[2];
2289 			x[3*i+2] = D->value[9*i+2] * w[0] + D->value[9*i+5] * w[1] + D->value[9*i+8] * w[2];
2290 			break;
2291 		}
2292 #else
2293 		for(ii=0;ii<bnr;ii++)
2294 		{
2295 			t = w[ii];
2296 			for(jj=0;jj<ii;jj++)
2297 			{
2298 				t -= D->value[bs*i+jj*bnr+ii] * x[bnr*i+jj];
2299 			}
2300 			x[bnr*i+ii] = t;
2301 		}
2302 		for(ii=bnr-1;ii>=0;ii--)
2303 		{
2304 			t = x[bnr*i+ii];
2305 			for(jj=ii+1;jj<bnr;jj++)
2306 			{
2307 				t -= D->value[bs*i+jj*bnr+ii] * x[bnr*i+jj];
2308 			}
2309 			x[bnr*i+ii] = t * D->value[bs*i+ii*bnr+ii];
2310 		}
2311 #endif
2312 	}
2313 
2314 	LIS_DEBUG_FUNC_OUT;
2315 	return LIS_SUCCESS;
2316 }
2317 
2318 #undef __FUNC__
2319 #define __FUNC__ "lis_psolveh_iluc"
lis_psolveh_iluc(LIS_SOLVER solver,LIS_VECTOR B,LIS_VECTOR X)2320 LIS_INT lis_psolveh_iluc(LIS_SOLVER solver, LIS_VECTOR B, LIS_VECTOR X)
2321 {
2322 #ifdef _OPENMP
2323 	LIS_INT i,j,jj,n,is,ie,my_rank,nprocs;
2324 	LIS_SCALAR w;
2325 	LIS_SCALAR *x;
2326 	LIS_MATRIX_ILU L,U;
2327 	LIS_VECTOR D;
2328 	LIS_PRECON precon;
2329 	LIS_QUAD_DECLAR;
2330 	#ifdef USE_QUAD_PRECISION
2331 		LIS_QUAD w1;
2332 		LIS_SCALAR *xl;
2333 	#endif
2334 
2335 	/*
2336 	 *  (LU)'x = b
2337 	 *  U'x=b
2338 	 *  L'x=b
2339 	 */
2340 
2341 
2342 	LIS_DEBUG_FUNC_IN;
2343 
2344 	precon = solver->precon;
2345 	L = precon->L;
2346 	U = precon->U;
2347 	D = precon->D;
2348 	n = L->n;
2349 	x = X->value;
2350 	#ifdef USE_QUAD_PRECISION
2351 		xl = X->value_lo;
2352 	#endif
2353 
2354 	#ifdef USE_QUAD_PRECISION
2355 		if( B->precision==LIS_PRECISION_DEFAULT )
2356 		{
2357 	#endif
2358 			lis_vector_copy(B,X);
2359 			nprocs = omp_get_max_threads();
2360 			#pragma omp parallel private(i,j,jj,is,ie,my_rank,w)
2361 			{
2362 				my_rank = omp_get_thread_num();
2363 				LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
2364 				for(i=is; i<ie; i++)
2365 				{
2366 					x[i] = x[i] * D->value[i];
2367 					for(j=0;j<U->nnz[i];j++)
2368 					{
2369 						jj     = U->index[i][j];
2370 						x[jj] -= conj(U->value[i][j]) * x[i];
2371 					}
2372 				}
2373 				for(i=ie-2; i>=is; i--)
2374 				{
2375 					w = x[i];
2376 					for(j=0;j<L->nnz[i];j++)
2377 					{
2378 						jj  = L->index[i][j];
2379 						w  -= conj(L->value[i][j]) * x[jj];
2380 					}
2381 					x[i] = w;
2382 				}
2383 			}
2384 	#ifdef USE_QUAD_PRECISION
2385 		}
2386 		else
2387 		{
2388 			lis_vector_copyex_mm(B,X);
2389 			nprocs = omp_get_max_threads();
2390 			#ifndef USE_SSE2
2391 				#pragma omp parallel private(i,j,jj,is,ie,my_rank,w1,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
2392 			#else
2393 				#pragma omp parallel private(i,j,jj,is,ie,my_rank,w1,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
2394 			#endif
2395 			{
2396 				my_rank = omp_get_thread_num();
2397 				LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
2398 				for(i=is; i<ie; i++)
2399 				{
2400 					#ifndef USE_SSE2
2401 						LIS_QUAD_MULD(x[i],xl[i],x[i],xl[i],D->value[i]);
2402 					#else
2403 						LIS_QUAD_MULD_SSE2(x[i],xl[i],x[i],xl[i],conj(D->value[i]));
2404 					#endif
2405 					/* x[i] = x[i] * conj(D->value[i]); */
2406 					for(j=0;j<U->nnz[i];j++)
2407 					{
2408 						jj     = U->index[i][j];
2409 						#ifndef USE_SSE2
2410 							LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-conj(U->value[i][j]));
2411 						#else
2412 							LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-conj(U->value[i][j]));
2413 						#endif
2414 						/* x[jj] -= conj(U->value[i][j]) * x[i]; */
2415 					}
2416 				}
2417 				for(i=ie-2; i>=is; i--)
2418 				{
2419 					w1.hi = x[i];
2420 					w1.lo = xl[i];
2421 					/* t = x[i]; */
2422 					for(j=0;j<L->nnz[i];j++)
2423 					{
2424 						jj  = L->index[i][j];
2425 						#ifndef USE_SSE2
2426 							LIS_QUAD_FMAD(w1.hi,w1.lo,w1.hi,w1.lo,x[jj],xl[jj],-conj(L->value[i][j]));
2427 						#else
2428 							LIS_QUAD_FMAD_SSE2(w1.hi,w1.lo,w1.hi,w1.lo,x[jj],xl[jj],-conj(L->value[i][j]));
2429 						#endif
2430 						/* t  -= conj(L->value[i][j]) * x[jj]; */
2431 					}
2432 					x[i]  = w1.hi;
2433 					xl[i] = w1.lo;
2434 					/* x[i] = t; */
2435 				}
2436 			}
2437 		}
2438 	#endif
2439 
2440 	LIS_DEBUG_FUNC_OUT;
2441 	return LIS_SUCCESS;
2442 #else
2443 	LIS_INT i,j,jj,n;
2444 	LIS_SCALAR w;
2445 	LIS_SCALAR *b,*x;
2446 	LIS_MATRIX_ILU L,U;
2447 	LIS_VECTOR D;
2448 	LIS_PRECON precon;
2449 	LIS_QUAD_DECLAR;
2450 	#ifdef USE_QUAD_PRECISION
2451 		LIS_QUAD w1;
2452 		LIS_SCALAR *xl;
2453 	#endif
2454 
2455 	/*
2456 	 *  (LU)'x = b
2457 	 *  U'x=b
2458 	 *  L'x=b
2459 	 */
2460 
2461 
2462 	LIS_DEBUG_FUNC_IN;
2463 
2464 	precon = solver->precon;
2465 	L = precon->L;
2466 	U = precon->U;
2467 	D = precon->D;
2468 	n = L->n;
2469 	b = B->value;
2470 	x = X->value;
2471 	#ifdef USE_QUAD_PRECISION
2472 		xl = X->value_lo;
2473 	#endif
2474 
2475 	#ifdef USE_QUAD_PRECISION
2476 		if( B->precision==LIS_PRECISION_DEFAULT )
2477 		{
2478 	#endif
2479 			lis_vector_copy(B,X);
2480 			for(i=0; i<n; i++)
2481 			{
2482 				x[i] = x[i] * D->value[i];
2483 				for(j=0;j<U->nnz[i];j++)
2484 				{
2485 					jj     = U->index[i][j];
2486 					x[jj] -= conj(U->value[i][j]) * x[i];
2487 				}
2488 			}
2489 			for(i=n-2; i>=0; i--)
2490 			{
2491 				w = x[i];
2492 				for(j=0;j<L->nnz[i];j++)
2493 				{
2494 					jj  = L->index[i][j];
2495 					w  -= conj(L->value[i][j]) * x[jj];
2496 				}
2497 				x[i] = w;
2498 			}
2499 	#ifdef USE_QUAD_PRECISION
2500 		}
2501 		else
2502 		{
2503 			lis_vector_copyex_mm(B,X);
2504 			for(i=0; i<n; i++)
2505 			{
2506 				#ifndef USE_SSE2
2507 					LIS_QUAD_MULD(x[i],xl[i],x[i],xl[i],D->value[i]);
2508 				#else
2509 					LIS_QUAD_MULD_SSE2(x[i],xl[i],x[i],xl[i],conj(D->value[i]));
2510 				#endif
2511 				/* x[i] = x[i] * conj(D->value[i]); */
2512 				for(j=0;j<U->nnz[i];j++)
2513 				{
2514 					jj     = U->index[i][j];
2515 					#ifndef USE_SSE2
2516 						LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-conj(U->value[i][j]));
2517 					#else
2518 						LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-conj(U->value[i][j]));
2519 					#endif
2520 					/* x[jj] -= conj(U->value[i][j]) * x[i]; */
2521 				}
2522 			}
2523 			for(i=n-2; i>=0; i--)
2524 			{
2525 				w1.hi = x[i];
2526 				w1.lo = xl[i];
2527 				/* t = x[i]; */
2528 				for(j=0;j<L->nnz[i];j++)
2529 				{
2530 					jj  = L->index[i][j];
2531 					#ifndef USE_SSE2
2532 						LIS_QUAD_FMAD(w1.hi,w1.lo,w1.hi,w1.lo,x[jj],xl[jj],-conj(L->value[i][j]));
2533 					#else
2534 						LIS_QUAD_FMAD_SSE2(w1.hi,w1.lo,w1.hi,w1.lo,x[jj],xl[jj],-conj(L->value[i][j]));
2535 					#endif
2536 					/* t  -= conj(L->value[i][j]) * x[jj]; */
2537 				}
2538 				x[i]  = w1.hi;
2539 				xl[i] = w1.lo;
2540 				/* x[i] = t; */
2541 			}
2542 		}
2543 	#endif
2544 
2545 	LIS_DEBUG_FUNC_OUT;
2546 	return LIS_SUCCESS;
2547 #endif
2548 }
2549 
2550