1 /* Copyright (C) 2005 The Scalable Software Infrastructure Project. All rights reserved.
2
3 Redistribution and use in source and binary forms, with or without
4 modification, are permitted provided that the following conditions are met:
5 1. Redistributions of source code must retain the above copyright
6 notice, this list of conditions and the following disclaimer.
7 2. Redistributions in binary form must reproduce the above copyright
8 notice, this list of conditions and the following disclaimer in the
9 documentation and/or other materials provided with the distribution.
10 3. Neither the name of the project nor the names of its contributors
11 may be used to endorse or promote products derived from this software
12 without specific prior written permission.
13
14 THIS SOFTWARE IS PROVIDED BY THE SCALABLE SOFTWARE INFRASTRUCTURE PROJECT
15 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
16 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE SCALABLE SOFTWARE INFRASTRUCTURE
18 PROJECT BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
19 OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
20 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
21 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
22 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
24 POSSIBILITY OF SUCH DAMAGE.
25 */
26
27 /*
28 * This subroutine is made based on ITSOL.
29 *
30 * http://www-users.cs.umn.edu/~saad/software/ITSOL/
31 *
32 */
33
34 #ifdef HAVE_CONFIG_H
35 #include "lis_config.h"
36 #else
37 #ifdef HAVE_CONFIG_WIN_H
38 #include "lis_config_win.h"
39 #endif
40 #endif
41
42 #include <stdio.h>
43 #include <stdlib.h>
44 #ifdef HAVE_MALLOC_H
45 #include <malloc.h>
46 #endif
47 #include <string.h>
48 #include <stdarg.h>
49 #include <math.h>
50 #ifdef _OPENMP
51 #include <omp.h>
52 #endif
53 #ifdef USE_MPI
54 #include <mpi.h>
55 #endif
56 #include "lislib.h"
57
58 /************************************************
59 * lis_precon_create
60 * lis_psolve
61 * lis_psolveh
62 ************************************************/
63
64 #define BSR_ENABLE 0
65 #undef __FUNC__
66 #define __FUNC__ "lis_precon_create_iluc"
lis_precon_create_iluc(LIS_SOLVER solver,LIS_PRECON precon)67 LIS_INT lis_precon_create_iluc(LIS_SOLVER solver, LIS_PRECON precon)
68 {
69 LIS_INT storage,block,err;
70 LIS_MATRIX A,B;
71
72 LIS_DEBUG_FUNC_IN;
73
74 #ifdef BSR_ENABLE
75 storage = solver->options[LIS_OPTIONS_STORAGE];
76 block = solver->options[LIS_OPTIONS_STORAGE_BLOCK];
77
78 if( storage==LIS_MATRIX_BSR )
79 {
80 if( solver->A->matrix_type!=storage )
81 {
82 err = lis_matrix_convert_self(solver);
83 if( err ) return err;
84 }
85 }
86 #endif
87
88 switch( solver->A->matrix_type )
89 {
90 case LIS_MATRIX_CSR:
91 err = lis_matrix_split(solver->A);
92 if( err )
93 {
94 return err;
95 }
96 err = lis_precon_create_iluc_csr(solver,precon);
97 break;
98 #ifdef BSR_ENABLE
99 case LIS_MATRIX_BSR:
100 err = lis_matrix_split(solver->A);
101 if( err )
102 {
103 return err;
104 }
105 err = lis_precon_create_iluc_bsr(solver,precon);
106 lis_psolve_xxx[LIS_PRECON_TYPE_ILUC] = lis_psolve_iluc_bsr;
107 break;
108 #endif
109 default:
110 A = solver->A;
111 err = lis_matrix_duplicate(A,&B);
112 if( err ) return err;
113 lis_matrix_set_type(B,LIS_MATRIX_CSR);
114 err = lis_matrix_convert(A,B);
115 if( err ) return err;
116 solver->A = B;
117 err = lis_matrix_split(solver->A);
118 if( err )
119 {
120 return err;
121 }
122 err = lis_precon_create_iluc_csr(solver,precon);
123 lis_matrix_destroy(B);
124 solver->A = A;
125 break;
126 }
127
128 LIS_DEBUG_FUNC_OUT;
129 return LIS_SUCCESS;
130 }
131
132
133 #undef __FUNC__
134 #define __FUNC__ "lis_precon_create_iluc_csr"
lis_precon_create_iluc_csr(LIS_SOLVER solver,LIS_PRECON precon)135 LIS_INT lis_precon_create_iluc_csr(LIS_SOLVER solver, LIS_PRECON precon)
136 {
137 #ifdef _OPENMP
138 LIS_INT err;
139 LIS_INT i,j,k,l,ii,jj,kk,ll;
140 LIS_INT is,ie,my_rank,nprocs;
141 LIS_INT n,nnz,lfil,len;
142 LIS_INT cz,cw;
143 LIS_INT *iw,*iw2,*wc,*wl,*iz,*zc,*zl;
144 LIS_INT *index,*ptr;
145 LIS_SCALAR val;
146 LIS_REAL t,tol,toldd;
147 LIS_SCALAR *z,*w,*tmp;
148 LIS_SCALAR *value;
149 LIS_SCALAR m;
150 LIS_MATRIX A;
151 LIS_MATRIX_ILU L,U;
152 LIS_VECTOR D;
153
154 LIS_DEBUG_FUNC_IN;
155
156
157 A = solver->A;
158 n = A->n;
159 tol = solver->params[LIS_PARAMS_DROP-LIS_OPTIONS_LEN];
160 m = solver->params[LIS_PARAMS_RATE-LIS_OPTIONS_LEN];
161 lfil = (LIS_INT)((double)A->nnz/(2.0*n))*m;
162 nprocs = omp_get_max_threads();
163
164 L = NULL;
165 U = NULL;
166
167
168 err = lis_matrix_ilu_create(n,1,&L);
169 if( err ) return err;
170 err = lis_matrix_ilu_create(n,1,&U);
171 if( err ) return err;
172 err = lis_matrix_ilu_setCR(L);
173 if( err ) return err;
174 err = lis_matrix_ilu_setCR(U);
175 if( err ) return err;
176 err = lis_vector_duplicate(A,&D);
177 if( err )
178 {
179 return err;
180 }
181
182 tmp = (LIS_SCALAR *)lis_malloc(nprocs*n*sizeof(LIS_SCALAR),"lis_precon_create_iluc_csr::tmp");
183 if( tmp==NULL )
184 {
185 LIS_SETERR_MEM(n*sizeof(LIS_SCALAR));
186 return LIS_OUT_OF_MEMORY;
187 }
188 w = (LIS_SCALAR *)lis_malloc(n*sizeof(LIS_SCALAR),"lis_precon_create_iluc_csr::w");
189 if( w==NULL )
190 {
191 LIS_SETERR_MEM(n*sizeof(LIS_SCALAR));
192 return LIS_OUT_OF_MEMORY;
193 }
194 z = (LIS_SCALAR *)lis_malloc(n*sizeof(LIS_SCALAR),"lis_precon_create_iluc_csr::z");
195 if( z==NULL )
196 {
197 LIS_SETERR_MEM(n*sizeof(LIS_SCALAR));
198 return LIS_OUT_OF_MEMORY;
199 }
200
201 iw = (LIS_INT *)lis_malloc(nprocs*n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::iw");
202 if( iw==NULL )
203 {
204 LIS_SETERR_MEM(nprocs*n*sizeof(LIS_INT));
205 return LIS_OUT_OF_MEMORY;
206 }
207 iw2 = (LIS_INT *)lis_malloc( nprocs*(n+1)*sizeof(LIS_INT), "lis_precon_create_iluc_csr::iw2" );
208 if( iw2==NULL )
209 {
210 LIS_SETERR_MEM(nprocs*(n+1)*sizeof(LIS_INT));
211 return LIS_OUT_OF_MEMORY;
212 }
213 wc = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::wc");
214 if( wc==NULL )
215 {
216 LIS_SETERR_MEM(n*sizeof(LIS_INT));
217 return LIS_OUT_OF_MEMORY;
218 }
219 wl = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::wl");
220 if( wl==NULL )
221 {
222 LIS_SETERR_MEM(n*sizeof(LIS_INT));
223 return LIS_OUT_OF_MEMORY;
224 }
225 iz = (LIS_INT *)lis_malloc(nprocs*n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::iz");
226 if( iw==NULL )
227 {
228 LIS_SETERR_MEM(n*sizeof(LIS_INT));
229 return LIS_OUT_OF_MEMORY;
230 }
231 zc = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::zc");
232 if( wc==NULL )
233 {
234 LIS_SETERR_MEM(n*sizeof(LIS_INT));
235 return LIS_OUT_OF_MEMORY;
236 }
237 zl = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::zl");
238 if( wl==NULL )
239 {
240 LIS_SETERR_MEM(n*sizeof(LIS_INT));
241 return LIS_OUT_OF_MEMORY;
242 }
243
244 nnz = A->L->nnz;
245 err = lis_matrix_malloc_csr(n,nnz,&ptr,&index,&value);
246
247
248 #pragma omp parallel private(i,ii,j,jj,k,kk,l,ll,is,ie,my_rank,cz,cw,toldd,t,nnz,len)
249 {
250 my_rank = omp_get_thread_num();
251 LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
252
253
254 #pragma omp for
255 for(i=0;i<nprocs;i++)
256 {
257 memset( &iw2[i*n], 0, n*sizeof(LIS_INT) );
258 }
259 #pragma omp for
260 for(i=0;i<=n;i++)
261 {
262 ptr[i] = 0;
263 }
264 #pragma omp for
265 for(i=0;i<n;i++)
266 {
267 for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
268 {
269 jj = A->L->index[j];
270 /* if( jj>=is && jj<ie )*/
271 {
272 iw2[my_rank*n + jj]++;
273 }
274 }
275 }
276 #pragma omp for
277 for(j=0;j<n;j++)
278 {
279 k = 0;
280 for(i=0;i<nprocs;i++)
281 {
282 k += iw2[i*n+j];
283 }
284 iw[j] = k;
285 }
286 #pragma omp single
287 for(j=0;j<n;j++)
288 {
289 ptr[j+1] = ptr[j] + iw[j];
290 }
291 #pragma omp for
292 for(j=0;j<n;j++)
293 {
294 k = ptr[j];
295 for(i=0;i<nprocs;i++)
296 {
297 l = iw2[i*n+j];
298 iw2[i*n+j] = k;
299 k = l + k;
300 }
301 }
302 #pragma omp for
303 for(i=0;i<n;i++)
304 {
305 for(j=ptr[i];j<ptr[i+1];j++)
306 {
307 value[j] = 0.0;
308 index[j] = 0;
309 }
310 }
311 #pragma omp for
312 for(i=0;i<n;i++)
313 {
314 for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
315 {
316 k = A->L->index[j];
317 /* if( k>=is && k<ie )*/
318 {
319 l = iw2[my_rank*n+k];
320 value[l] = A->L->value[j];
321 index[l] = i;
322 iw2[my_rank*n+k]++;
323 }
324 }
325 }
326
327
328 for(i=is;i<ie;i++)
329 {
330 D->value[i] = A->D->value[i];
331 zl[i] = -1;
332 wl[i] = -1;
333 zc[i] = 0;
334 wc[i] = 0;
335 }
336
337
338 for(k=is;k<ie;k++)
339 {
340 cz = 0;
341 cw = 0;
342 /* z_j = a_kj (j>=k) */
343 for(j=A->U->ptr[k];j<A->U->ptr[k+1];j++)
344 {
345 jj = A->U->index[j];
346 if( jj<is || jj>=ie ) continue;
347 #ifdef USE_MPI
348 if( jj>n-1 ) continue;
349 #endif
350 z[jj] = A->U->value[j];
351 iz[my_rank*n+cz++] = jj;
352 zc[jj] = 1;
353 }
354
355 /* w_j = a_jk (j>k) */
356 for(j=ptr[k];j<ptr[k+1];j++)
357 {
358 jj = index[j];
359 if( jj<is || jj>=ie ) continue;
360 w[jj] = value[j];
361 iw[my_rank*n+cw++] = jj;
362 wc[jj] = 1;
363 }
364
365 /* z_j = z_j - l_ki*u_ij */
366 i = wl[k];
367 while( i>=is )
368 {
369 ii = wc[i];
370 kk = zc[i];
371 val = L->value[i][ii];
372 for(j=kk;j<U->nnz[i];j++)
373 {
374 jj = U->index[i][j];
375 if( jj==k ) continue;
376 if( zc[jj]==1 )
377 {
378 z[jj] -= val*U->value[i][j];
379 }
380 else
381 {
382 z[jj] = -val*U->value[i][j];
383 iz[my_rank*n+cz++] = jj;
384 zc[jj] = 1;
385 }
386 }
387 ii++;
388 wc[i] = ii;
389 if( ii<L->nnz[i] )
390 {
391 kk = L->index[i][ii];
392 ll = i;
393 i = wl[ll];
394 wl[ll] = wl[kk];
395 wl[kk] = ll;
396 }
397 else
398 {
399 i = wl[i];
400 }
401 }
402
403 /* w_j = w_j - l_ji*u_ik */
404 i = zl[k];
405 while( i>=is )
406 {
407 ii = zc[i];
408 kk = wc[i];
409 val = U->value[i][ii];
410 for(j=kk;j<L->nnz[i];j++)
411 {
412 jj = L->index[i][j];
413 if( wc[jj]==1 )
414 {
415 w[jj] -= val*L->value[i][j];
416 }
417 else
418 {
419 w[jj] = -val*L->value[i][j];
420 iw[my_rank*n+cw++] = jj;
421 wc[jj] = 1;
422 }
423 }
424 ii++;
425 zc[i] = ii;
426 if( ii<U->nnz[i] )
427 {
428 kk = U->index[i][ii];
429 ll = i;
430 i = zl[ll];
431 zl[ll] = zl[kk];
432 zl[kk] = ll;
433 }
434 else
435 {
436 i = zl[i];
437 }
438 }
439
440 toldd = fabs(D->value[k])*tol;
441 D->value[k] = 1.0/D->value[k];
442 t = D->value[k];
443 if( cz<cw )
444 {
445 for(j=0;j<cz;j++)
446 {
447 jj = iz[my_rank*n+j];
448 if( wc[jj] )
449 {
450 D->value[jj] -= z[jj]*w[jj]*t;
451 }
452 }
453 }
454 else
455 {
456 for(j=0;j<cw;j++)
457 {
458 jj = iw[my_rank*n+j];
459 if( zc[jj] )
460 {
461 D->value[jj] -= z[jj]*w[jj]*t;
462 }
463 }
464 }
465 /* drop U */
466 nnz = 0;
467 for(j=0;j<cz;j++)
468 {
469 jj = iz[my_rank*n+j];
470 if( fabs(z[jj])>toldd )
471 {
472 iz[my_rank*n+nnz++] = jj;
473 }
474 else
475 {
476 z[k] = z[k] + fabs(z[jj]);
477 zc[jj] = 0;
478 }
479 }
480 len = _min(lfil,nnz);
481 for(j=0;j<nnz;j++) tmp[my_rank*n+j] = fabs(z[is+j]);
482 lis_sort_di(0,nnz-1,&tmp[my_rank*n],&iz[my_rank*n]);
483 lis_sort_i(0,len-1,&iz[my_rank*n]);
484 U->nnz[k] = len;
485 if( len>0 )
486 {
487 U->index[k] = (LIS_INT *)malloc(len*sizeof(LIS_INT));
488 U->value[k] = (LIS_SCALAR *)malloc(len*sizeof(LIS_SCALAR));
489 }
490 for(j=0;j<len;j++)
491 {
492 jj = iz[my_rank*n+j];
493 U->index[k][j] = jj;
494 U->value[k][j] = z[jj];
495 }
496 for(j=len;j<cz;j++) zc[iz[my_rank*n+j]] = 0;
497 cz = nnz;
498
499 /* drop L */
500 nnz = 0;
501 for(j=0;j<cw;j++)
502 {
503 jj = iw[my_rank*n+j];
504 if( fabs(w[jj])>toldd )
505 {
506 iw[my_rank*n+nnz++] = jj;
507 }
508 else
509 {
510 wc[jj] = 0;
511 }
512 }
513 len = _min(lfil,nnz);
514 for(j=0;j<nnz;j++) tmp[my_rank*n+j] = fabs(w[is+j]);
515 lis_sort_di(0,nnz-1,&tmp[my_rank*n],&iw[my_rank*n]);
516 lis_sort_i(0,len-1,&iw[my_rank*n]);
517 L->nnz[k] = len;
518 if( len>0 )
519 {
520 L->index[k] = (LIS_INT *)malloc(len*sizeof(LIS_INT));
521 L->value[k] = (LIS_SCALAR *)malloc(len*sizeof(LIS_SCALAR));
522 }
523 for(j=0;j<len;j++)
524 {
525 jj = iw[my_rank*n+j];
526 L->index[k][j] = jj;
527 L->value[k][j] = w[jj]*t;
528 }
529 for(j=len;j<cw;j++) wc[iw[my_rank*n+j]] = 0;
530 cw = nnz;
531
532 /**/
533 for(j=0;j<cz;j++) zc[iz[my_rank*n+j]] = 0;
534 for(j=0;j<cw;j++) wc[iw[my_rank*n+j]] = 0;
535 if(U->nnz[k]>0)
536 {
537 jj = iz[my_rank*n+0];
538 zc[k] = 0;
539 zl[k] = zl[jj];
540 zl[jj] = k;
541 }
542 if(L->nnz[k]>0)
543 {
544 jj = iw[my_rank*n+0];
545 wc[k] = 0;
546 wl[k] = wl[jj];
547 wl[jj] = k;
548 }
549 }
550 }
551
552 precon->L = L;
553 precon->U = U;
554 precon->D = D;
555
556 lis_free2(13,tmp,w,z,iw,iw2,wc,wl,iz,zc,zl,ptr,index,value);
557
558 LIS_DEBUG_FUNC_OUT;
559 return LIS_SUCCESS;
560 #else
561 LIS_INT err;
562 LIS_INT i,j,k,l,ii,jj,kk,ll;
563 LIS_INT n,nnz,annz,lfil,len;
564 LIS_INT cz,cw;
565 LIS_INT *iw,*wc,*wl,*iz,*zc,*zl;
566 LIS_INT *index,*ptr;
567 LIS_SCALAR gamma,val;
568 LIS_REAL t,tol,toldd;
569 LIS_SCALAR *z,*w,*tmp;
570 LIS_SCALAR *value;
571 LIS_SCALAR m;
572 LIS_MATRIX A;
573 LIS_MATRIX_ILU L,U;
574 LIS_VECTOR D;
575
576 LIS_DEBUG_FUNC_IN;
577
578
579 A = solver->A;
580 n = A->n;
581 tol = solver->params[LIS_PARAMS_DROP-LIS_OPTIONS_LEN];
582 m = solver->params[LIS_PARAMS_RATE-LIS_OPTIONS_LEN];
583 gamma = solver->params[LIS_PARAMS_GAMMA-LIS_OPTIONS_LEN];
584 annz = 10+A->nnz / A->n;
585 lfil = (LIS_INT)(((double)A->nnz/(2.0*n))*m);
586
587 L = NULL;
588 U = NULL;
589
590
591 err = lis_matrix_ilu_create(n,1,&L);
592 if( err ) return err;
593 err = lis_matrix_ilu_create(n,1,&U);
594 if( err ) return err;
595 err = lis_matrix_ilu_setCR(L);
596 if( err ) return err;
597 err = lis_matrix_ilu_setCR(U);
598 if( err ) return err;
599 err = lis_vector_duplicate(A,&D);
600 if( err )
601 {
602 return err;
603 }
604
605 tmp = (LIS_SCALAR *)lis_malloc(n*sizeof(LIS_SCALAR),"lis_precon_create_iluc_csr::tmp");
606 if( tmp==NULL )
607 {
608 LIS_SETERR_MEM(n*sizeof(LIS_SCALAR));
609 return LIS_OUT_OF_MEMORY;
610 }
611 w = (LIS_SCALAR *)lis_malloc(n*sizeof(LIS_SCALAR),"lis_precon_create_iluc_csr::w");
612 if( w==NULL )
613 {
614 LIS_SETERR_MEM(n*sizeof(LIS_SCALAR));
615 return LIS_OUT_OF_MEMORY;
616 }
617 z = (LIS_SCALAR *)lis_malloc(n*sizeof(LIS_SCALAR),"lis_precon_create_iluc_csr::z");
618 if( z==NULL )
619 {
620 LIS_SETERR_MEM(n*sizeof(LIS_SCALAR));
621 return LIS_OUT_OF_MEMORY;
622 }
623
624 iw = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::iw");
625 if( iw==NULL )
626 {
627 LIS_SETERR_MEM(n*sizeof(LIS_INT));
628 return LIS_OUT_OF_MEMORY;
629 }
630 wc = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::wc");
631 if( wc==NULL )
632 {
633 LIS_SETERR_MEM(n*sizeof(LIS_INT));
634 return LIS_OUT_OF_MEMORY;
635 }
636 wl = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::wl");
637 if( wl==NULL )
638 {
639 LIS_SETERR_MEM(n*sizeof(LIS_INT));
640 return LIS_OUT_OF_MEMORY;
641 }
642 iz = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::iz");
643 if( iw==NULL )
644 {
645 LIS_SETERR_MEM(n*sizeof(LIS_INT));
646 return LIS_OUT_OF_MEMORY;
647 }
648 zc = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::zc");
649 if( wc==NULL )
650 {
651 LIS_SETERR_MEM(n*sizeof(LIS_INT));
652 return LIS_OUT_OF_MEMORY;
653 }
654 zl = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::zl");
655 if( wl==NULL )
656 {
657 LIS_SETERR_MEM(n*sizeof(LIS_INT));
658 return LIS_OUT_OF_MEMORY;
659 }
660
661 lis_matrix_sort_csr(A);
662 nnz = A->L->nnz;
663 err = lis_matrix_malloc_csr(n,nnz,&ptr,&index,&value);
664
665 for(i=0;i<n;i++) iw[i] = 0;
666 for(i=0;i<n;i++)
667 {
668 for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
669 {
670 jj = A->L->index[j];
671 iw[jj]++;
672 }
673 }
674 ptr[0] = 0;
675 for(i=0;i<n;i++)
676 {
677 ptr[i+1] = ptr[i] + iw[i];
678 iw[i] = ptr[i];
679 }
680 for(i=0;i<n;i++)
681 {
682 for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
683 {
684 k = A->L->index[j];
685 l = iw[k];
686 value[l] = A->L->value[j];
687 index[l] = i;
688 iw[k]++;
689 }
690 }
691
692
693 for(i=0;i<n;i++)
694 {
695 D->value[i] = gamma * A->D->value[i];
696 zl[i] = -1;
697 wl[i] = -1;
698 zc[i] = 0;
699 wc[i] = 0;
700 }
701
702
703 for(k=0;k<n;k++)
704 {
705 cz = 0;
706 cw = 0;
707 /* z_j = a_kj (j>=k) */
708 for(j=A->U->ptr[k];j<A->U->ptr[k+1];j++)
709 {
710 jj = A->U->index[j];
711 #ifdef USE_MPI
712 if( jj>n-1 ) continue;
713 #endif
714 z[jj] = A->U->value[j];
715 iz[cz++] = jj;
716 zc[jj] = 1;
717 }
718
719 /* w_j = a_jk (j>k) */
720 for(j=ptr[k];j<ptr[k+1];j++)
721 {
722 jj = index[j];
723 w[jj] = value[j];
724 iw[cw++] = jj;
725 wc[jj] = 1;
726 }
727
728 /* z_j = z_j - l_ki*u_ij */
729 i = wl[k];
730 while( i>=0 )
731 {
732 ii = wc[i];
733 kk = zc[i];
734 val = L->value[i][ii];
735 for(j=kk;j<U->nnz[i];j++)
736 {
737 jj = U->index[i][j];
738 if( jj==k ) continue;
739 if( zc[jj]==1 )
740 {
741 z[jj] -= val*U->value[i][j];
742 }
743 else
744 {
745 z[jj] = -val*U->value[i][j];
746 iz[cz++] = jj;
747 zc[jj] = 1;
748 }
749 }
750 ii++;
751 wc[i] = ii;
752 if( ii<L->nnz[i] )
753 {
754 kk = L->index[i][ii];
755 ll = i;
756 i = wl[ll];
757 wl[ll] = wl[kk];
758 wl[kk] = ll;
759 }
760 else
761 {
762 i = wl[i];
763 }
764 }
765
766 /* w_j = w_j - l_ji*u_ik */
767 i = zl[k];
768 while( i>=0 )
769 {
770 ii = zc[i];
771 kk = wc[i];
772 val = U->value[i][ii];
773 for(j=kk;j<L->nnz[i];j++)
774 {
775 jj = L->index[i][j];
776 if( wc[jj]==1 )
777 {
778 w[jj] -= val*L->value[i][j];
779 }
780 else
781 {
782 w[jj] = -val*L->value[i][j];
783 iw[cw++] = jj;
784 wc[jj] = 1;
785 }
786 }
787 ii++;
788 zc[i] = ii;
789 if( ii<U->nnz[i] )
790 {
791 kk = U->index[i][ii];
792 ll = i;
793 i = zl[ll];
794 zl[ll] = zl[kk];
795 zl[kk] = ll;
796 }
797 else
798 {
799 i = zl[i];
800 }
801 }
802
803 toldd = fabs(D->value[k])*tol;
804 D->value[k] = 1.0/D->value[k];
805 t = D->value[k];
806 if( cz<cw )
807 {
808 for(j=0;j<cz;j++)
809 {
810 jj = iz[j];
811 if( wc[jj] )
812 {
813 D->value[jj] -= z[jj]*w[jj]*t;
814 }
815 }
816 }
817 else
818 {
819 for(j=0;j<cw;j++)
820 {
821 jj = iw[j];
822 if( zc[jj] )
823 {
824 D->value[jj] -= z[jj]*w[jj]*t;
825 }
826 }
827 }
828 /* drop U */
829 nnz = 0;
830 for(j=0;j<cz;j++)
831 {
832 jj = iz[j];
833 if( fabs(z[jj])>toldd )
834 {
835 iz[nnz++] = jj;
836 }
837 else
838 {
839 z[k] = z[k] + fabs(z[jj]);
840 zc[jj] = 0;
841 }
842 }
843 len = _min(lfil,nnz);
844 for(j=0;j<nnz;j++) tmp[j] = fabs(z[j]);
845 lis_sort_di(0,nnz-1,tmp,iz);
846 lis_sort_i(0,len-1,iz);
847 U->nnz[k] = len;
848 if( len>0 )
849 {
850 U->index[k] = (LIS_INT *)malloc(len*sizeof(LIS_INT));
851 U->value[k] = (LIS_SCALAR *)malloc(len*sizeof(LIS_SCALAR));
852 }
853 for(j=0;j<len;j++)
854 {
855 jj = iz[j];
856 U->index[k][j] = jj;
857 U->value[k][j] = z[jj];
858 }
859 for(j=len;j<cz;j++) zc[iz[j]] = 0;
860 cz = nnz;
861
862 /* drop L */
863 nnz = 0;
864 for(j=0;j<cw;j++)
865 {
866 jj = iw[j];
867 if( fabs(w[jj])>toldd )
868 {
869 iw[nnz++] = jj;
870 }
871 else
872 {
873 wc[jj] = 0;
874 }
875 }
876 len = _min(lfil,nnz);
877 for(j=0;j<nnz;j++) tmp[j] = fabs(w[j]);
878 lis_sort_di(0,nnz-1,tmp,iw);
879 lis_sort_i(0,len-1,iw);
880 L->nnz[k] = len;
881 if( len>0 )
882 {
883 L->index[k] = (LIS_INT *)malloc(len*sizeof(LIS_INT));
884 L->value[k] = (LIS_SCALAR *)malloc(len*sizeof(LIS_SCALAR));
885 }
886 for(j=0;j<len;j++)
887 {
888 jj = iw[j];
889 L->index[k][j] = jj;
890 L->value[k][j] = w[jj]*t;
891 }
892 for(j=len;j<cw;j++) wc[iw[j]] = 0;
893 cw = nnz;
894
895 /**/
896 for(j=0;j<cz;j++) zc[iz[j]] = 0;
897 for(j=0;j<cw;j++) wc[iw[j]] = 0;
898 if(U->nnz[k]>0)
899 {
900 jj = iz[0];
901 zc[k] = 0;
902 zl[k] = zl[jj];
903 zl[jj] = k;
904 }
905 if(L->nnz[k]>0)
906 {
907 jj = iw[0];
908 wc[k] = 0;
909 wl[k] = wl[jj];
910 wl[jj] = k;
911 }
912 }
913
914 precon->L = L;
915 precon->U = U;
916 precon->D = D;
917
918 lis_free2(12,tmp,w,z,iw,wc,wl,iz,zc,zl,ptr,index,value);
919
920 LIS_DEBUG_FUNC_OUT;
921 return LIS_SUCCESS;
922 #endif
923 }
924
925 #undef __FUNC__
926 #define __FUNC__ "lis_precon_create_iluc_bsr"
lis_precon_create_iluc_bsr(LIS_SOLVER solver,LIS_PRECON precon)927 LIS_INT lis_precon_create_iluc_bsr(LIS_SOLVER solver, LIS_PRECON precon)
928 {
929 LIS_Comm comm;
930 LIS_INT err;
931 LIS_INT i,j,k,l,ii,jj,kk,ll,bnr,bs;
932 LIS_INT n,nr,nnz,bnnz,lfil,len;
933 LIS_INT cz,cw;
934 LIS_INT *iw,*wc,*wl,*iz,*zc,*zl;
935 LIS_INT *index,*ptr;
936 LIS_SCALAR gamma,val;
937 LIS_REAL t,tol,toldd;
938 LIS_SCALAR *z,*w,*tmp,wz[9],lu[9];
939 LIS_SCALAR *value;
940 LIS_SCALAR m,a;
941 LIS_MATRIX A;
942 LIS_MATRIX_ILU L,U;
943 LIS_MATRIX_DIAG D;
944
945 LIS_DEBUG_FUNC_IN;
946
947 comm = LIS_COMM_WORLD;
948
949 A = solver->A;
950 n = A->n;
951 nr = A->nr;
952 bnr = A->bnr;
953 bs = bnr*bnr;
954 tol = solver->params[LIS_PARAMS_DROP-LIS_OPTIONS_LEN];
955 m = solver->params[LIS_PARAMS_RATE-LIS_OPTIONS_LEN];
956 gamma = solver->params[LIS_PARAMS_GAMMA-LIS_OPTIONS_LEN];
957 lfil = (LIS_INT)(((double)A->bnnz/(2.0*nr))*m);
958
959 L = NULL;
960 U = NULL;
961
962
963 err = lis_matrix_ilu_create(nr,bnr,&L);
964 if( err ) return err;
965 err = lis_matrix_ilu_create(nr,bnr,&U);
966 if( err ) return err;
967 err = lis_matrix_ilu_setCR(L);
968 if( err ) return err;
969 err = lis_matrix_ilu_setCR(U);
970 if( err ) return err;
971 err = lis_matrix_diag_duplicateM(A,&D);
972 if( err )
973 {
974 return err;
975 }
976
977 tmp = (LIS_SCALAR *)lis_malloc(n*sizeof(LIS_SCALAR),"lis_precon_create_iluc_csr::tmp");
978 if( tmp==NULL )
979 {
980 LIS_SETERR_MEM(n*sizeof(LIS_SCALAR));
981 return LIS_OUT_OF_MEMORY;
982 }
983 w = (LIS_SCALAR *)lis_malloc(bs*nr*sizeof(LIS_SCALAR),"lis_precon_create_iluc_csr::w");
984 if( w==NULL )
985 {
986 LIS_SETERR_MEM(n*sizeof(LIS_SCALAR));
987 return LIS_OUT_OF_MEMORY;
988 }
989 z = (LIS_SCALAR *)lis_malloc(bs*nr*sizeof(LIS_SCALAR),"lis_precon_create_iluc_csr::z");
990 if( z==NULL )
991 {
992 LIS_SETERR_MEM(n*sizeof(LIS_SCALAR));
993 return LIS_OUT_OF_MEMORY;
994 }
995
996 iw = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::iw");
997 if( iw==NULL )
998 {
999 LIS_SETERR_MEM(n*sizeof(LIS_INT));
1000 return LIS_OUT_OF_MEMORY;
1001 }
1002 wc = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::wc");
1003 if( wc==NULL )
1004 {
1005 LIS_SETERR_MEM(n*sizeof(LIS_INT));
1006 return LIS_OUT_OF_MEMORY;
1007 }
1008 wl = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::wl");
1009 if( wl==NULL )
1010 {
1011 LIS_SETERR_MEM(n*sizeof(LIS_INT));
1012 return LIS_OUT_OF_MEMORY;
1013 }
1014 iz = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::iz");
1015 if( iw==NULL )
1016 {
1017 LIS_SETERR_MEM(n*sizeof(LIS_INT));
1018 return LIS_OUT_OF_MEMORY;
1019 }
1020 zc = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::zc");
1021 if( wc==NULL )
1022 {
1023 LIS_SETERR_MEM(n*sizeof(LIS_INT));
1024 return LIS_OUT_OF_MEMORY;
1025 }
1026 zl = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::zl");
1027 if( wl==NULL )
1028 {
1029 LIS_SETERR_MEM(n*sizeof(LIS_INT));
1030 return LIS_OUT_OF_MEMORY;
1031 }
1032
1033 bnnz = A->L->bnnz;
1034 err = lis_matrix_malloc_bsr(n,bnr,bnr,bnnz,&ptr,&index,&value);
1035
1036 for(i=0;i<nr;i++) iw[i] = 0;
1037 for(i=0;i<nr;i++)
1038 {
1039 for(j=A->L->bptr[i];j<A->L->bptr[i+1];j++)
1040 {
1041 jj = A->L->bindex[j];
1042 iw[jj]++;
1043 }
1044 }
1045 ptr[0] = 0;
1046 for(i=0;i<nr;i++)
1047 {
1048 ptr[i+1] = ptr[i] + iw[i];
1049 iw[i] = ptr[i];
1050 }
1051 for(i=0;i<nr;i++)
1052 {
1053 for(j=A->L->bptr[i];j<A->L->bptr[i+1];j++)
1054 {
1055 k = A->L->bindex[j];
1056 l = iw[k];
1057 memcpy(&value[bs*l],&A->L->value[bs*j],bs*sizeof(LIS_SCALAR));
1058 index[l] = i;
1059 iw[k]++;
1060 }
1061 }
1062
1063
1064 switch(bnr)
1065 {
1066 case 1:
1067 for(i=0;i<nr;i++)
1068 {
1069 D->value[i] = gamma*A->D->value[i];
1070 zl[i] = -1;
1071 wl[i] = -1;
1072 zc[i] = 0;
1073 wc[i] = 0;
1074 }
1075 break;
1076 case 2:
1077 for(i=0;i<nr;i++)
1078 {
1079 D->value[4*i+0] = gamma*A->D->value[4*i+0];
1080 D->value[4*i+1] = gamma*A->D->value[4*i+1];
1081 D->value[4*i+2] = gamma*A->D->value[4*i+2];
1082 D->value[4*i+3] = gamma*A->D->value[4*i+3];
1083 zl[i] = -1;
1084 wl[i] = -1;
1085 zc[i] = 0;
1086 wc[i] = 0;
1087 }
1088 if( n%2!=0 )
1089 {
1090 D->value[4*(nr-1)+3] = 1.0;
1091 }
1092 break;
1093 case 3:
1094 for(i=0;i<nr;i++)
1095 {
1096 D->value[9*i+0] = gamma*A->D->value[9*i+0];
1097 D->value[9*i+1] = gamma*A->D->value[9*i+1];
1098 D->value[9*i+2] = gamma*A->D->value[9*i+2];
1099 D->value[9*i+3] = gamma*A->D->value[9*i+3];
1100 D->value[9*i+4] = gamma*A->D->value[9*i+4];
1101 D->value[9*i+5] = gamma*A->D->value[9*i+5];
1102 D->value[9*i+6] = gamma*A->D->value[9*i+6];
1103 D->value[9*i+7] = gamma*A->D->value[9*i+7];
1104 D->value[9*i+8] = gamma*A->D->value[9*i+8];
1105 zl[i] = -1;
1106 wl[i] = -1;
1107 zc[i] = 0;
1108 wc[i] = 0;
1109 }
1110 if( n%3==1 )
1111 {
1112 D->value[9*(nr-1)+4] = 1.0;
1113 D->value[9*(nr-1)+8] = 1.0;
1114 }
1115 else if( n%3==2 )
1116 {
1117 D->value[9*(nr-1)+8] = 1.0;
1118 }
1119 break;
1120 }
1121
1122
1123 for(k=0;k<nr;k++)
1124 {
1125 cz = 0;
1126 cw = 0;
1127 /* z_j = a_kj (j>=k) */
1128 for(j=A->U->bptr[k];j<A->U->bptr[k+1];j++)
1129 {
1130 jj = A->U->bindex[j];
1131 #ifdef USE_MPI
1132 if( jj>nr-1 ) continue;
1133 #endif
1134 #if 0
1135 switch(bnr)
1136 {
1137 case 1:
1138 toldd = fabs(D->value[k]);
1139 t = fabs(A->U->value[j]);
1140 break;
1141 case 2:
1142 toldd = fabs(D->value[4*k+0]) + fabs(D->value[4*k+1]);
1143 toldd += fabs(D->value[4*k+2]) + fabs(D->value[4*k+3]);
1144 /* toldd = sqrt(toldd);*/
1145 t = fabs(A->U->value[4*j+0]) + fabs(A->U->value[4*j+1]);
1146 t += fabs(A->U->value[4*j+2]) + fabs(A->U->value[4*j+3]);
1147 /* t = sqrt(t);*/
1148 break;
1149 case 3:
1150 toldd = fabs(D->value[9*k+0]) + fabs(D->value[9*k+1]) + fabs(D->value[9*k+2]);
1151 toldd += fabs(D->value[9*k+3]) + fabs(D->value[9*k+4]) + fabs(D->value[9*k+5]);
1152 toldd += fabs(D->value[9*k+6]) + fabs(D->value[9*k+7]) + fabs(D->value[9*k+8]);
1153 /* toldd = sqrt(toldd);*/
1154 t = fabs(A->U->value[9*j+0]) + fabs(A->U->value[9*j+1]) + fabs(A->U->value[9*j+2]);
1155 t += fabs(A->U->value[9*j+3]) + fabs(A->U->value[9*j+4]) + fabs(A->U->value[9*j+5]);
1156 t += fabs(A->U->value[9*j+6]) + fabs(A->U->value[9*j+7]) + fabs(A->U->value[9*j+8]);
1157 /* t = sqrt(t);*/
1158 break;
1159 }
1160 if( toldd>t*1.0e-10 )
1161 {
1162 memcpy(&z[bs*jj],&A->U->value[bs*j],bs*sizeof(LIS_SCALAR));
1163 iz[cz++] = jj;
1164 zc[jj] = 1;
1165 }
1166 #else
1167 memcpy(&z[bs*jj],&A->U->value[bs*j],bs*sizeof(LIS_SCALAR));
1168 iz[cz++] = jj;
1169 zc[jj] = 1;
1170 #endif
1171 }
1172
1173 /* w_j = a_jk (j>k) */
1174 for(j=ptr[k];j<ptr[k+1];j++)
1175 {
1176 jj = index[j];
1177 #if 0
1178 switch(bnr)
1179 {
1180 case 1:
1181 toldd = fabs(D->value[k]);
1182 t = fabs(value[j]);
1183 break;
1184 case 2:
1185 toldd = fabs(D->value[4*k+0]) + fabs(D->value[4*k+1]);
1186 toldd += fabs(D->value[4*k+2]) + fabs(D->value[4*k+3]);
1187 /* toldd = sqrt(toldd);*/
1188 t = fabs(value[4*j+0]) + fabs(value[4*j+1]);
1189 t += fabs(value[4*j+2]) + fabs(value[4*j+3]);
1190 /* t = sqrt(t);*/
1191 break;
1192 case 3:
1193 toldd = fabs(D->value[9*k+0]) + fabs(D->value[9*k+1]) + fabs(D->value[9*k+2]);
1194 toldd += fabs(D->value[9*k+3]) + fabs(D->value[9*k+4]) + fabs(D->value[9*k+5]);
1195 toldd += fabs(D->value[9*k+6]) + fabs(D->value[9*k+7]) + fabs(D->value[9*k+8]);
1196 /* toldd = sqrt(toldd);*/
1197 t = fabs(value[9*j+0]) + fabs(value[9*j+1]) + fabs(value[9*j+2]);
1198 t += fabs(value[9*j+3]) + fabs(value[9*j+4]) + fabs(value[9*j+5]);
1199 t += fabs(value[9*j+6]) + fabs(value[9*j+7]) + fabs(value[9*j+8]);
1200 /* t = sqrt(t);*/
1201 break;
1202 }
1203 if( toldd>t*1.0e-10 )
1204 {
1205 memcpy(&w[bs*jj],&value[bs*j],bs*sizeof(LIS_SCALAR));
1206 iw[cw++] = jj;
1207 wc[jj] = 1;
1208 }
1209 #else
1210 memcpy(&w[bs*jj],&value[bs*j],bs*sizeof(LIS_SCALAR));
1211 iw[cw++] = jj;
1212 wc[jj] = 1;
1213 #endif
1214 }
1215
1216 /* z_j = z_j - l_ki*u_ij */
1217 i = wl[k];
1218 while( i>=0 )
1219 {
1220 ii = wc[i];
1221 kk = zc[i];
1222 val = L->value[i][ii];
1223 for(j=kk;j<U->nnz[i];j++)
1224 {
1225 jj = U->index[i][j];
1226 #if 1
1227 if( jj==k ) continue;
1228 if( zc[jj]==1 )
1229 #else
1230 if( jj==k )
1231 {
1232 switch(bnr)
1233 {
1234 case 1:
1235 D->value[jj] -= val*U->value[i][j];
1236 break;
1237 case 2:
1238 D->value[4*jj+0] -= L->value[i][4*ii+0]*U->value[i][4*j+0];
1239 D->value[4*jj+0] -= L->value[i][4*ii+2]*U->value[i][4*j+1];
1240 D->value[4*jj+1] -= L->value[i][4*ii+1]*U->value[i][4*j+0];
1241 D->value[4*jj+1] -= L->value[i][4*ii+3]*U->value[i][4*j+1];
1242 D->value[4*jj+2] -= L->value[i][4*ii+0]*U->value[i][4*j+2];
1243 D->value[4*jj+2] -= L->value[i][4*ii+2]*U->value[i][4*j+3];
1244 D->value[4*jj+3] -= L->value[i][4*ii+1]*U->value[i][4*j+2];
1245 D->value[4*jj+3] -= L->value[i][4*ii+3]*U->value[i][4*j+3];
1246 break;
1247 case 3:
1248 D->value[9*jj+0] -= L->value[i][9*ii+0]*U->value[i][9*j+0] + L->value[i][9*ii+3]*U->value[i][9*j+1] + L->value[i][9*ii+6]*U->value[i][9*j+2];
1249 D->value[9*jj+1] -= L->value[i][9*ii+1]*U->value[i][9*j+0] + L->value[i][9*ii+4]*U->value[i][9*j+1] + L->value[i][9*ii+7]*U->value[i][9*j+2];
1250 D->value[9*jj+2] -= L->value[i][9*ii+2]*U->value[i][9*j+0] + L->value[i][9*ii+5]*U->value[i][9*j+1] + L->value[i][9*ii+8]*U->value[i][9*j+2];
1251 D->value[9*jj+3] -= L->value[i][9*ii+0]*U->value[i][9*j+3] + L->value[i][9*ii+3]*U->value[i][9*j+4] + L->value[i][9*ii+6]*U->value[i][9*j+5];
1252 D->value[9*jj+4] -= L->value[i][9*ii+1]*U->value[i][9*j+3] + L->value[i][9*ii+4]*U->value[i][9*j+4] + L->value[i][9*ii+7]*U->value[i][9*j+5];
1253 D->value[9*jj+5] -= L->value[i][9*ii+2]*U->value[i][9*j+3] + L->value[i][9*ii+5]*U->value[i][9*j+4] + L->value[i][9*ii+8]*U->value[i][9*j+5];
1254 D->value[9*jj+6] -= L->value[i][9*ii+0]*U->value[i][9*j+6] + L->value[i][9*ii+3]*U->value[i][9*j+7] + L->value[i][9*ii+6]*U->value[i][9*j+8];
1255 D->value[9*jj+7] -= L->value[i][9*ii+1]*U->value[i][9*j+6] + L->value[i][9*ii+4]*U->value[i][9*j+7] + L->value[i][9*ii+7]*U->value[i][9*j+8];
1256 D->value[9*jj+8] -= L->value[i][9*ii+2]*U->value[i][9*j+6] + L->value[i][9*ii+5]*U->value[i][9*j+7] + L->value[i][9*ii+8]*U->value[i][9*j+8];
1257 break;
1258 }
1259 }
1260 else if( zc[jj]==1 )
1261 #endif
1262 {
1263 switch(bnr)
1264 {
1265 case 1:
1266 z[jj] -= val*U->value[i][j];
1267 break;
1268 case 2:
1269 z[4*jj+0] -= L->value[i][4*ii+0]*U->value[i][4*j+0];
1270 z[4*jj+0] -= L->value[i][4*ii+2]*U->value[i][4*j+1];
1271 z[4*jj+1] -= L->value[i][4*ii+1]*U->value[i][4*j+0];
1272 z[4*jj+1] -= L->value[i][4*ii+3]*U->value[i][4*j+1];
1273 z[4*jj+2] -= L->value[i][4*ii+0]*U->value[i][4*j+2];
1274 z[4*jj+2] -= L->value[i][4*ii+2]*U->value[i][4*j+3];
1275 z[4*jj+3] -= L->value[i][4*ii+1]*U->value[i][4*j+2];
1276 z[4*jj+3] -= L->value[i][4*ii+3]*U->value[i][4*j+3];
1277 break;
1278 case 3:
1279 z[9*jj+0] -= L->value[i][9*ii+0]*U->value[i][9*j+0] + L->value[i][9*ii+3]*U->value[i][9*j+1] + L->value[i][9*ii+6]*U->value[i][9*j+2];
1280 z[9*jj+1] -= L->value[i][9*ii+1]*U->value[i][9*j+0] + L->value[i][9*ii+4]*U->value[i][9*j+1] + L->value[i][9*ii+7]*U->value[i][9*j+2];
1281 z[9*jj+2] -= L->value[i][9*ii+2]*U->value[i][9*j+0] + L->value[i][9*ii+5]*U->value[i][9*j+1] + L->value[i][9*ii+8]*U->value[i][9*j+2];
1282 z[9*jj+3] -= L->value[i][9*ii+0]*U->value[i][9*j+3] + L->value[i][9*ii+3]*U->value[i][9*j+4] + L->value[i][9*ii+6]*U->value[i][9*j+5];
1283 z[9*jj+4] -= L->value[i][9*ii+1]*U->value[i][9*j+3] + L->value[i][9*ii+4]*U->value[i][9*j+4] + L->value[i][9*ii+7]*U->value[i][9*j+5];
1284 z[9*jj+5] -= L->value[i][9*ii+2]*U->value[i][9*j+3] + L->value[i][9*ii+5]*U->value[i][9*j+4] + L->value[i][9*ii+8]*U->value[i][9*j+5];
1285 z[9*jj+6] -= L->value[i][9*ii+0]*U->value[i][9*j+6] + L->value[i][9*ii+3]*U->value[i][9*j+7] + L->value[i][9*ii+6]*U->value[i][9*j+8];
1286 z[9*jj+7] -= L->value[i][9*ii+1]*U->value[i][9*j+6] + L->value[i][9*ii+4]*U->value[i][9*j+7] + L->value[i][9*ii+7]*U->value[i][9*j+8];
1287 z[9*jj+8] -= L->value[i][9*ii+2]*U->value[i][9*j+6] + L->value[i][9*ii+5]*U->value[i][9*j+7] + L->value[i][9*ii+8]*U->value[i][9*j+8];
1288 break;
1289 }
1290 }
1291 else
1292 {
1293 switch(bnr)
1294 {
1295 case 1:
1296 z[jj] = -val*U->value[i][j];
1297 break;
1298 case 2:
1299 z[4*jj+0] = -L->value[i][4*ii+0]*U->value[i][4*j+0] - L->value[i][4*ii+2]*U->value[i][4*j+1];
1300 z[4*jj+1] = -L->value[i][4*ii+1]*U->value[i][4*j+0] - L->value[i][4*ii+3]*U->value[i][4*j+1];
1301 z[4*jj+2] = -L->value[i][4*ii+0]*U->value[i][4*j+2] - L->value[i][4*ii+2]*U->value[i][4*j+3];
1302 z[4*jj+3] = -L->value[i][4*ii+1]*U->value[i][4*j+2] - L->value[i][4*ii+3]*U->value[i][4*j+3];
1303 break;
1304 case 3:
1305 z[9*jj+0] = -L->value[i][9*ii+0]*U->value[i][9*j+0] - L->value[i][9*ii+3]*U->value[i][9*j+1] - L->value[i][9*ii+6]*U->value[i][9*j+2];
1306 z[9*jj+1] = -L->value[i][9*ii+1]*U->value[i][9*j+0] - L->value[i][9*ii+4]*U->value[i][9*j+1] - L->value[i][9*ii+7]*U->value[i][9*j+2];
1307 z[9*jj+2] = -L->value[i][9*ii+2]*U->value[i][9*j+0] - L->value[i][9*ii+5]*U->value[i][9*j+1] - L->value[i][9*ii+8]*U->value[i][9*j+2];
1308 z[9*jj+3] = -L->value[i][9*ii+0]*U->value[i][9*j+3] - L->value[i][9*ii+3]*U->value[i][9*j+4] - L->value[i][9*ii+6]*U->value[i][9*j+5];
1309 z[9*jj+4] = -L->value[i][9*ii+1]*U->value[i][9*j+3] - L->value[i][9*ii+4]*U->value[i][9*j+4] - L->value[i][9*ii+7]*U->value[i][9*j+5];
1310 z[9*jj+5] = -L->value[i][9*ii+2]*U->value[i][9*j+3] - L->value[i][9*ii+5]*U->value[i][9*j+4] - L->value[i][9*ii+8]*U->value[i][9*j+5];
1311 z[9*jj+6] = -L->value[i][9*ii+0]*U->value[i][9*j+6] - L->value[i][9*ii+3]*U->value[i][9*j+7] - L->value[i][9*ii+6]*U->value[i][9*j+8];
1312 z[9*jj+7] = -L->value[i][9*ii+1]*U->value[i][9*j+6] - L->value[i][9*ii+4]*U->value[i][9*j+7] - L->value[i][9*ii+7]*U->value[i][9*j+8];
1313 z[9*jj+8] = -L->value[i][9*ii+2]*U->value[i][9*j+6] - L->value[i][9*ii+5]*U->value[i][9*j+7] - L->value[i][9*ii+8]*U->value[i][9*j+8];
1314 break;
1315 }
1316 iz[cz++] = jj;
1317 zc[jj] = 1;
1318 }
1319 }
1320 ii++;
1321 wc[i] = ii;
1322 if( ii<L->nnz[i] )
1323 {
1324 kk = L->index[i][ii];
1325 ll = i;
1326 i = wl[ll];
1327 wl[ll] = wl[kk];
1328 wl[kk] = ll;
1329 }
1330 else
1331 {
1332 i = wl[i];
1333 }
1334 }
1335
1336 /* w_j = w_j - l_ji*u_ik */
1337 i = zl[k];
1338 while( i>=0 )
1339 {
1340 ii = zc[i];
1341 kk = wc[i];
1342 val = U->value[i][ii];
1343 for(j=kk;j<L->nnz[i];j++)
1344 {
1345 jj = L->index[i][j];
1346 if( wc[jj]==1 )
1347 {
1348 switch(bnr)
1349 {
1350 case 1:
1351 w[jj] -= val*L->value[i][j];
1352 break;
1353 case 2:
1354 w[4*jj+0] -= U->value[i][4*ii+0]*L->value[i][4*j+0];
1355 w[4*jj+0] -= U->value[i][4*ii+2]*L->value[i][4*j+1];
1356 w[4*jj+1] -= U->value[i][4*ii+1]*L->value[i][4*j+0];
1357 w[4*jj+1] -= U->value[i][4*ii+3]*L->value[i][4*j+1];
1358 w[4*jj+2] -= U->value[i][4*ii+0]*L->value[i][4*j+2];
1359 w[4*jj+2] -= U->value[i][4*ii+2]*L->value[i][4*j+3];
1360 w[4*jj+3] -= U->value[i][4*ii+1]*L->value[i][4*j+2];
1361 w[4*jj+3] -= U->value[i][4*ii+3]*L->value[i][4*j+3];
1362
1363 /* w[4*jj+0] -= L->value[i][4*j+0]*U->value[i][4*ii+0];
1364 w[4*jj+0] -= L->value[i][4*j+2]*U->value[i][4*ii+1];
1365 w[4*jj+1] -= L->value[i][4*j+1]*U->value[i][4*ii+0];
1366 w[4*jj+1] -= L->value[i][4*j+3]*U->value[i][4*ii+1];
1367 w[4*jj+2] -= L->value[i][4*j+0]*U->value[i][4*ii+2];
1368 w[4*jj+2] -= L->value[i][4*j+2]*U->value[i][4*ii+3];
1369 w[4*jj+3] -= L->value[i][4*j+1]*U->value[i][4*ii+2];
1370 w[4*jj+3] -= L->value[i][4*j+3]*U->value[i][4*ii+3];*/
1371 break;
1372 case 3:
1373 w[9*jj+0] -= L->value[i][9*j+0]*U->value[i][9*ii+0] + L->value[i][9*j+3]*U->value[i][9*ii+1] + L->value[i][9*j+6]*U->value[i][9*ii+2];
1374 w[9*jj+1] -= L->value[i][9*j+1]*U->value[i][9*ii+0] + L->value[i][9*j+4]*U->value[i][9*ii+1] + L->value[i][9*j+7]*U->value[i][9*ii+2];
1375 w[9*jj+2] -= L->value[i][9*j+2]*U->value[i][9*ii+0] + L->value[i][9*j+5]*U->value[i][9*ii+1] + L->value[i][9*j+8]*U->value[i][9*ii+2];
1376 w[9*jj+3] -= L->value[i][9*j+0]*U->value[i][9*ii+3] + L->value[i][9*j+3]*U->value[i][9*ii+4] + L->value[i][9*j+6]*U->value[i][9*ii+5];
1377 w[9*jj+4] -= L->value[i][9*j+1]*U->value[i][9*ii+3] + L->value[i][9*j+4]*U->value[i][9*ii+4] + L->value[i][9*j+7]*U->value[i][9*ii+5];
1378 w[9*jj+5] -= L->value[i][9*j+2]*U->value[i][9*ii+3] + L->value[i][9*j+5]*U->value[i][9*ii+4] + L->value[i][9*j+8]*U->value[i][9*ii+5];
1379 w[9*jj+6] -= L->value[i][9*j+0]*U->value[i][9*ii+6] + L->value[i][9*j+3]*U->value[i][9*ii+7] + L->value[i][9*j+6]*U->value[i][9*ii+8];
1380 w[9*jj+7] -= L->value[i][9*j+1]*U->value[i][9*ii+6] + L->value[i][9*j+4]*U->value[i][9*ii+7] + L->value[i][9*j+7]*U->value[i][9*ii+8];
1381 w[9*jj+8] -= L->value[i][9*j+2]*U->value[i][9*ii+6] + L->value[i][9*j+5]*U->value[i][9*ii+7] + L->value[i][9*j+8]*U->value[i][9*ii+8];
1382
1383 /* w[9*jj+0] -= U->value[i][9*ii+0]*L->value[i][9*j+0] + U->value[i][9*ii+3]*L->value[i][9*j+1] + U->value[i][9*ii+6]*L->value[i][9*j+2];
1384 w[9*jj+1] -= U->value[i][9*ii+1]*L->value[i][9*j+0] + U->value[i][9*ii+4]*L->value[i][9*j+1] + U->value[i][9*ii+7]*L->value[i][9*j+2];
1385 w[9*jj+2] -= U->value[i][9*ii+2]*L->value[i][9*j+0] + U->value[i][9*ii+5]*L->value[i][9*j+1] + U->value[i][9*ii+8]*L->value[i][9*j+2];
1386 w[9*jj+3] -= U->value[i][9*ii+0]*L->value[i][9*j+3] + U->value[i][9*ii+3]*L->value[i][9*j+4] + U->value[i][9*ii+6]*L->value[i][9*j+5];
1387 w[9*jj+4] -= U->value[i][9*ii+1]*L->value[i][9*j+3] + U->value[i][9*ii+4]*L->value[i][9*j+4] + U->value[i][9*ii+7]*L->value[i][9*j+5];
1388 w[9*jj+5] -= U->value[i][9*ii+2]*L->value[i][9*j+3] + U->value[i][9*ii+5]*L->value[i][9*j+4] + U->value[i][9*ii+8]*L->value[i][9*j+5];
1389 w[9*jj+6] -= U->value[i][9*ii+0]*L->value[i][9*j+6] + U->value[i][9*ii+3]*L->value[i][9*j+7] + U->value[i][9*ii+6]*L->value[i][9*j+8];
1390 w[9*jj+7] -= U->value[i][9*ii+1]*L->value[i][9*j+6] + U->value[i][9*ii+4]*L->value[i][9*j+7] + U->value[i][9*ii+7]*L->value[i][9*j+8];
1391 w[9*jj+8] -= U->value[i][9*ii+2]*L->value[i][9*j+6] + U->value[i][9*ii+5]*L->value[i][9*j+7] + U->value[i][9*ii+8]*L->value[i][9*j+8];*/
1392 break;
1393 }
1394 }
1395 else
1396 {
1397 switch(bnr)
1398 {
1399 case 1:
1400 w[jj] = -val*L->value[i][j];
1401 break;
1402 case 2:
1403 w[4*jj+0] = -U->value[i][4*ii+0]*L->value[i][4*j+0] - U->value[i][4*ii+2]*L->value[i][4*j+1];
1404 w[4*jj+1] = -U->value[i][4*ii+1]*L->value[i][4*j+0] - U->value[i][4*ii+3]*L->value[i][4*j+1];
1405 w[4*jj+2] = -U->value[i][4*ii+0]*L->value[i][4*j+2] - U->value[i][4*ii+2]*L->value[i][4*j+3];
1406 w[4*jj+3] = -U->value[i][4*ii+1]*L->value[i][4*j+2] - U->value[i][4*ii+3]*L->value[i][4*j+3];
1407
1408 /* w[4*jj+0] = -L->value[i][4*j+0]*U->value[i][4*ii+0] - L->value[i][4*j+2]*U->value[i][4*ii+1];
1409 w[4*jj+1] = -L->value[i][4*j+1]*U->value[i][4*ii+0] - L->value[i][4*j+3]*U->value[i][4*ii+1];
1410 w[4*jj+2] = -L->value[i][4*j+0]*U->value[i][4*ii+2] - L->value[i][4*j+2]*U->value[i][4*ii+3];
1411 w[4*jj+3] = -L->value[i][4*j+1]*U->value[i][4*ii+2] - L->value[i][4*j+3]*U->value[i][4*ii+3];*/
1412 break;
1413 case 3:
1414 w[9*jj+0] = -L->value[i][9*j+0]*U->value[i][9*ii+0] - L->value[i][9*j+3]*U->value[i][9*ii+1] - L->value[i][9*j+6]*U->value[i][9*ii+2];
1415 w[9*jj+1] = -L->value[i][9*j+1]*U->value[i][9*ii+0] - L->value[i][9*j+4]*U->value[i][9*ii+1] - L->value[i][9*j+7]*U->value[i][9*ii+2];
1416 w[9*jj+2] = -L->value[i][9*j+2]*U->value[i][9*ii+0] - L->value[i][9*j+5]*U->value[i][9*ii+1] - L->value[i][9*j+8]*U->value[i][9*ii+2];
1417 w[9*jj+3] = -L->value[i][9*j+0]*U->value[i][9*ii+3] - L->value[i][9*j+3]*U->value[i][9*ii+4] - L->value[i][9*j+6]*U->value[i][9*ii+5];
1418 w[9*jj+4] = -L->value[i][9*j+1]*U->value[i][9*ii+3] - L->value[i][9*j+4]*U->value[i][9*ii+4] - L->value[i][9*j+7]*U->value[i][9*ii+5];
1419 w[9*jj+5] = -L->value[i][9*j+2]*U->value[i][9*ii+3] - L->value[i][9*j+5]*U->value[i][9*ii+4] - L->value[i][9*j+8]*U->value[i][9*ii+5];
1420 w[9*jj+6] = -L->value[i][9*j+0]*U->value[i][9*ii+6] - L->value[i][9*j+3]*U->value[i][9*ii+7] - L->value[i][9*j+6]*U->value[i][9*ii+8];
1421 w[9*jj+7] = -L->value[i][9*j+1]*U->value[i][9*ii+6] - L->value[i][9*j+4]*U->value[i][9*ii+7] - L->value[i][9*j+7]*U->value[i][9*ii+8];
1422 w[9*jj+8] = -L->value[i][9*j+2]*U->value[i][9*ii+6] - L->value[i][9*j+5]*U->value[i][9*ii+7] - L->value[i][9*j+8]*U->value[i][9*ii+8];
1423
1424 /* w[9*jj+0] = -U->value[i][9*ii+0]*L->value[i][9*j+0] - U->value[i][9*ii+3]*L->value[i][9*j+1] - U->value[i][9*ii+6]*L->value[i][9*j+2];
1425 w[9*jj+1] = -U->value[i][9*ii+1]*L->value[i][9*j+0] - U->value[i][9*ii+4]*L->value[i][9*j+1] - U->value[i][9*ii+7]*L->value[i][9*j+2];
1426 w[9*jj+2] = -U->value[i][9*ii+2]*L->value[i][9*j+0] - U->value[i][9*ii+5]*L->value[i][9*j+1] - U->value[i][9*ii+8]*L->value[i][9*j+2];
1427 w[9*jj+3] = -U->value[i][9*ii+0]*L->value[i][9*j+3] - U->value[i][9*ii+3]*L->value[i][9*j+4] - U->value[i][9*ii+6]*L->value[i][9*j+5];
1428 w[9*jj+4] = -U->value[i][9*ii+1]*L->value[i][9*j+3] - U->value[i][9*ii+4]*L->value[i][9*j+4] - U->value[i][9*ii+7]*L->value[i][9*j+5];
1429 w[9*jj+5] = -U->value[i][9*ii+2]*L->value[i][9*j+3] - U->value[i][9*ii+5]*L->value[i][9*j+4] - U->value[i][9*ii+8]*L->value[i][9*j+5];
1430 w[9*jj+6] = -U->value[i][9*ii+0]*L->value[i][9*j+6] - U->value[i][9*ii+3]*L->value[i][9*j+7] - U->value[i][9*ii+6]*L->value[i][9*j+8];
1431 w[9*jj+7] = -U->value[i][9*ii+1]*L->value[i][9*j+6] - U->value[i][9*ii+4]*L->value[i][9*j+7] - U->value[i][9*ii+7]*L->value[i][9*j+8];
1432 w[9*jj+8] = -U->value[i][9*ii+2]*L->value[i][9*j+6] - U->value[i][9*ii+5]*L->value[i][9*j+7] - U->value[i][9*ii+8]*L->value[i][9*j+8];*/
1433 break;
1434 }
1435 iw[cw++] = jj;
1436 wc[jj] = 1;
1437 }
1438 }
1439 ii++;
1440 zc[i] = ii;
1441 if( ii<U->nnz[i] )
1442 {
1443 kk = U->index[i][ii];
1444 ll = i;
1445 i = zl[ll];
1446 zl[ll] = zl[kk];
1447 zl[kk] = ll;
1448 }
1449 else
1450 {
1451 i = zl[i];
1452 }
1453 }
1454
1455 switch(bnr)
1456 {
1457 case 1:
1458 toldd = fabs(D->value[k])*tol;
1459 break;
1460 case 2:
1461 toldd = fabs(D->value[4*k+0]) + fabs(D->value[4*k+1]);
1462 toldd += fabs(D->value[4*k+2]) + fabs(D->value[4*k+3]);
1463 /* toldd = sqrt(toldd)*tol;*/
1464 toldd = toldd*tol;
1465 break;
1466 case 3:
1467 toldd = fabs(D->value[9*k+0]) + fabs(D->value[9*k+1]) + fabs(D->value[9*k+2]);
1468 toldd += fabs(D->value[9*k+3]) + fabs(D->value[9*k+4]) + fabs(D->value[9*k+5]);
1469 toldd += fabs(D->value[9*k+6]) + fabs(D->value[9*k+7]) + fabs(D->value[9*k+8]);
1470 /* toldd = sqrt(toldd)*tol;*/
1471 toldd = toldd*tol;
1472 break;
1473 }
1474 /* D->value[k] = 1.0/D->value[k];*/
1475 /* t = D->value[k];*/
1476
1477 #if 1
1478 memcpy(lu,&D->value[bs*k],bs*sizeof(LIS_SCALAR));
1479 for(kk=0;kk<bnr;kk++)
1480 {
1481 lu[kk*bnr+kk] = 1.0 / lu[kk*bnr+kk];
1482 for(ii=kk+1;ii<bnr;ii++)
1483 {
1484 a = lu[kk*bnr+ii] * lu[kk*bnr+kk];
1485 for(jj=kk+1;jj<bnr;jj++)
1486 {
1487 lu[jj*bnr+ii] -= a * lu[jj*bnr+kk];
1488 }
1489 lu[kk*bnr+ii] = a;
1490 }
1491 }
1492 memcpy(&D->value[bs*k],lu,bs*sizeof(LIS_SCALAR));
1493 switch(bnr)
1494 {
1495 case 1:
1496 lis_printf(comm,"k=%d toldd=%e\n",k,toldd);
1497 lis_printf(comm,"k=%d %e\n",k,D->value[k]);
1498 break;
1499 case 2:
1500 lis_printf(comm,"k=%d toldd=%e\n",k,toldd);
1501 lis_printf(comm,"k=%d %e %e\n",k,D->value[4*k+0],D->value[4*k+2]);
1502 lis_printf(comm,"k=%d %e %e\n",k,D->value[4*k+1],D->value[4*k+3]);
1503 break;
1504 case 3:
1505 lis_printf(comm,"k=%d toldd=%e\n",k,toldd);
1506 lis_printf(comm,"k=%d %e %e %e\n",k,D->value[9*k+0],D->value[9*k+3],D->value[9*k+6]);
1507 lis_printf(comm,"k=%d %e %e %e\n",k,D->value[9*k+1],D->value[9*k+4],D->value[9*k+7]);
1508 lis_printf(comm,"k=%d %e %e %e\n",k,D->value[9*k+2],D->value[9*k+5],D->value[9*k+8]);
1509 break;
1510 }
1511 #endif
1512 #if 0
1513 switch(bnr)
1514 {
1515 case 1:
1516 D->value[k] = 1.0/D->value[k];
1517 break;
1518 case 2:
1519 /*
1520 D[i] = |d0 d2|
1521 |d1 d3|
1522 */
1523 t = 1.0/(D->value[4*k]*D->value[4*k+3] - D->value[4*k+1]*D->value[4*k+2]);
1524
1525 a = D->value[4*k];
1526 lis_printf(comm,"k=%d t=%e\n",k,t);
1527 lis_printf(comm,"k=%d %e %e\n",k,D->value[4*k+0],D->value[4*k+2]);
1528 lis_printf(comm,"k=%d %e %e\n",k,D->value[4*k+1],D->value[4*k+3]);
1529 D->value[4*k] = t*D->value[4*k+3];
1530 D->value[4*k+1] = -t*D->value[4*k+1];
1531 D->value[4*k+2] = -t*D->value[4*k+2];
1532 D->value[4*k+3] = t*a;
1533 break;
1534 case 3:
1535 /*
1536 D[i] = |d0 d3 d6|
1537 |d1 d4 d7|
1538 |d2 d5 d8|
1539 */
1540 t = D->value[9*k]*D->value[9*k+4]*D->value[9*k+8] + D->value[9*k+1]*D->value[9*k+5]*D->value[9*k+6] + D->value[9*k+2]*D->value[9*k+3]*D->value[9*k+7];
1541 t -= D->value[9*k]*D->value[9*k+5]*D->value[9*k+7] + D->value[9*k+1]*D->value[9*k+3]*D->value[9*k+8] + D->value[9*k+2]*D->value[9*k+4]*D->value[9*k+6];
1542 lis_printf(comm,"k=%d t=%e\n",k,t);
1543 t = 1.0 / t;
1544 b[0] = t*(D->value[9*k+4]*D->value[9*k+8] - D->value[9*k+5]*D->value[9*k+7]);
1545 b[1] = t*(D->value[9*k+5]*D->value[9*k+6] - D->value[9*k+3]*D->value[9*k+8]);
1546 b[2] = t*(D->value[9*k+3]*D->value[9*k+7] - D->value[9*k+4]*D->value[9*k+6]);
1547 b[3] = t*(D->value[9*k+2]*D->value[9*k+7] - D->value[9*k+1]*D->value[9*k+8]);
1548 b[4] = t*(D->value[9*k+0]*D->value[9*k+8] - D->value[9*k+2]*D->value[9*k+6]);
1549 b[5] = t*(D->value[9*k+1]*D->value[9*k+6] - D->value[9*k+0]*D->value[9*k+7]);
1550 b[6] = t*(D->value[9*k+1]*D->value[9*k+5] - D->value[9*k+2]*D->value[9*k+4]);
1551 b[7] = t*(D->value[9*k+2]*D->value[9*k+3] - D->value[9*k+0]*D->value[9*k+5]);
1552 b[8] = t*(D->value[9*k+0]*D->value[9*k+4] - D->value[9*k+1]*D->value[9*k+3]);
1553 memcpy(&D->value[9*k],b,9*sizeof(LIS_SCALAR));
1554 /* lis_printf(comm,"k=%d t=%e\n",k,t);
1555 lis_printf(comm,"k=%d %e %e %e\n",k,b[0],b[3],b[6]);
1556 lis_printf(comm,"k=%d %e %e %e\n",k,b[1],b[4],b[7]);
1557 lis_printf(comm,"k=%d %e %e %e\n",k,b[2],b[5],b[8]);*/
1558 break;
1559 }
1560 memcpy(tt,&D->value[bs*k],bs*sizeof(LIS_SCALAR));
1561 #endif
1562 #if 1
1563 if( cz<cw )
1564 {
1565 for(j=0;j<cz;j++)
1566 {
1567 jj = iz[j];
1568 if( wc[jj] )
1569 {
1570 #if 1
1571 for(i=0;i<bnr;i++)
1572 {
1573 wz[i] = -w[bs*jj+i]*lu[0];
1574 for(ii=1;ii<bnr;ii++)
1575 {
1576 a = -w[bs*jj+ii*bnr+i];
1577 for(kk=0;kk<ii-1;kk++)
1578 {
1579 a -= wz[kk*bnr+i] * lu[ii*bnr+kk];
1580 }
1581 wz[ii*bnr+i] = a * lu[ii*bnr+ii];
1582 }
1583 }
1584 for(i=0;i<bnr;i++)
1585 {
1586 for(ii=bnr-1;ii>=0;ii--)
1587 {
1588 a = wz[ii*bnr+i];
1589 for(kk=ii+1;kk<bnr;kk++)
1590 {
1591 a -= wz[kk*bnr+i] * lu[ii*bnr+kk];
1592 }
1593 wz[ii*bnr+i] = a;
1594 }
1595 }
1596 #endif
1597 switch(bnr)
1598 {
1599 case 1:
1600 /* D->value[jj] -= z[jj]*w[jj];*/
1601 /* D->value[jj] -= z[jj]*w[jj]*tt[0];*/
1602 D->value[jj] -= z[jj]*wz[0];
1603 break;
1604 case 2:
1605 /* wz[0] = w[4*jj+0]*tt[0] + w[4*jj+2]*tt[1];
1606 wz[1] = w[4*jj+1]*tt[0] + w[4*jj+3]*tt[1];
1607 wz[2] = w[4*jj+0]*tt[2] + w[4*jj+2]*tt[3];
1608 wz[3] = w[4*jj+1]*tt[2] + w[4*jj+3]*tt[3];*/
1609 D->value[4*jj+0] -= z[4*jj+0]*wz[0] + z[4*jj+2]*wz[1];
1610 D->value[4*jj+1] -= z[4*jj+1]*wz[0] + z[4*jj+3]*wz[1];
1611 D->value[4*jj+2] -= z[4*jj+0]*wz[2] + z[4*jj+2]*wz[3];
1612 D->value[4*jj+3] -= z[4*jj+1]*wz[2] + z[4*jj+3]*wz[3];
1613
1614 /* D->value[4*jj+0] -= w[4*jj+0]*z[4*jj+0] + w[4*jj+2]*z[4*jj+1];
1615 D->value[4*jj+1] -= w[4*jj+1]*z[4*jj+0] + w[4*jj+3]*z[4*jj+1];
1616 D->value[4*jj+2] -= w[4*jj+0]*z[4*jj+2] + w[4*jj+2]*z[4*jj+3];
1617 D->value[4*jj+3] -= w[4*jj+1]*z[4*jj+2] + w[4*jj+3]*z[4*jj+3];*/
1618 break;
1619 case 3:
1620 /* wz[0] = tt[0]*z[9*jj+0] + tt[3]*z[9*jj+1] + tt[6]*z[9*jj+2];
1621 wz[1] = tt[1]*z[9*jj+0] + tt[4]*z[9*jj+1] + tt[7]*z[9*jj+2];
1622 wz[2] = tt[2]*z[9*jj+0] + tt[5]*z[9*jj+1] + tt[8]*z[9*jj+2];
1623 wz[3] = tt[0]*z[9*jj+3] + tt[3]*z[9*jj+4] + tt[6]*z[9*jj+5];
1624 wz[4] = tt[1]*z[9*jj+3] + tt[4]*z[9*jj+4] + tt[7]*z[9*jj+5];
1625 wz[5] = tt[2]*z[9*jj+3] + tt[5]*z[9*jj+4] + tt[8]*z[9*jj+5];
1626 wz[6] = tt[0]*z[9*jj+6] + tt[3]*z[9*jj+7] + tt[6]*z[9*jj+8];
1627 wz[7] = tt[1]*z[9*jj+6] + tt[4]*z[9*jj+7] + tt[7]*z[9*jj+8];
1628 wz[8] = tt[2]*z[9*jj+6] + tt[5]*z[9*jj+7] + tt[8]*z[9*jj+8];
1629 */
1630 /* wz[0] = z[9*jj+0];
1631 wz[1] = z[9*jj+1];
1632 wz[2] = z[9*jj+2];
1633 wz[3] = z[9*jj+3];
1634 wz[4] = z[9*jj+4];
1635 wz[5] = z[9*jj+5];
1636 wz[6] = z[9*jj+6];
1637 wz[7] = z[9*jj+7];
1638 wz[8] = z[9*jj+8];*/
1639
1640 D->value[9*jj+0] -= w[9*jj+0]*wz[0] + w[9*jj+3]*wz[1] + w[9*jj+6]*wz[2];
1641 D->value[9*jj+1] -= w[9*jj+1]*wz[0] + w[9*jj+4]*wz[1] + w[9*jj+7]*wz[2];
1642 D->value[9*jj+2] -= w[9*jj+2]*wz[0] + w[9*jj+5]*wz[1] + w[9*jj+8]*wz[2];
1643 D->value[9*jj+3] -= w[9*jj+0]*wz[3] + w[9*jj+3]*wz[4] + w[9*jj+6]*wz[5];
1644 D->value[9*jj+4] -= w[9*jj+1]*wz[3] + w[9*jj+4]*wz[4] + w[9*jj+7]*wz[5];
1645 D->value[9*jj+5] -= w[9*jj+2]*wz[3] + w[9*jj+5]*wz[4] + w[9*jj+8]*wz[5];
1646 D->value[9*jj+6] -= w[9*jj+0]*wz[6] + w[9*jj+3]*wz[7] + w[9*jj+6]*wz[8];
1647 D->value[9*jj+7] -= w[9*jj+1]*wz[6] + w[9*jj+4]*wz[7] + w[9*jj+7]*wz[8];
1648 D->value[9*jj+8] -= w[9*jj+2]*wz[6] + w[9*jj+5]*wz[7] + w[9*jj+8]*wz[8];
1649 break;
1650 }
1651 }
1652 }
1653 }
1654 else
1655 {
1656 for(j=0;j<cw;j++)
1657 {
1658 jj = iw[j];
1659 if( zc[jj] )
1660 {
1661 #if 1
1662 for(i=0;i<bnr;i++)
1663 {
1664 wz[i] = -w[bs*jj+i]*lu[0];
1665 for(ii=1;ii<bnr;ii++)
1666 {
1667 a = -w[bs*jj+ii*bnr+i];
1668 for(kk=0;kk<ii-1;kk++)
1669 {
1670 a -= wz[kk*bnr+i] * lu[ii*bnr+kk];
1671 }
1672 wz[ii*bnr+i] = a * lu[ii*bnr+ii];
1673 }
1674 }
1675 for(i=0;i<bnr;i++)
1676 {
1677 for(ii=bnr-1;ii>=0;ii--)
1678 {
1679 a = wz[ii*bnr+i];
1680 for(kk=ii+1;kk<bnr;kk++)
1681 {
1682 a -= wz[kk*bnr+i] * lu[ii*bnr+kk];
1683 }
1684 wz[ii*bnr+i] = a;
1685 }
1686 }
1687 #endif
1688 switch(bnr)
1689 {
1690 case 1:
1691 /* D->value[jj] -= z[jj]*w[jj];
1692 D->value[jj] -= z[jj]*w[jj]*tt[0];*/
1693 D->value[jj] -= z[jj]*wz[0];
1694 break;
1695 case 2:
1696 /* wz[0] = w[4*jj+0]*tt[0] + w[4*jj+2]*tt[1];
1697 wz[1] = w[4*jj+1]*tt[0] + w[4*jj+3]*tt[1];
1698 wz[2] = w[4*jj+0]*tt[2] + w[4*jj+2]*tt[3];
1699 wz[3] = w[4*jj+1]*tt[2] + w[4*jj+3]*tt[3];
1700 */
1701 D->value[4*jj+0] -= z[4*jj+0]*wz[0] + z[4*jj+2]*wz[1];
1702 D->value[4*jj+1] -= z[4*jj+1]*wz[0] + z[4*jj+3]*wz[1];
1703 D->value[4*jj+2] -= z[4*jj+0]*wz[2] + z[4*jj+2]*wz[3];
1704 D->value[4*jj+3] -= z[4*jj+1]*wz[2] + z[4*jj+3]*wz[3];
1705
1706 /* D->value[4*jj+0] -= w[4*jj+0]*z[4*jj+0] + w[4*jj+2]*z[4*jj+1];
1707 D->value[4*jj+1] -= w[4*jj+1]*z[4*jj+0] + w[4*jj+3]*z[4*jj+1];
1708 D->value[4*jj+2] -= w[4*jj+0]*z[4*jj+2] + w[4*jj+2]*z[4*jj+3];
1709 D->value[4*jj+3] -= w[4*jj+1]*z[4*jj+2] + w[4*jj+3]*z[4*jj+3];*/
1710 break;
1711 case 3:
1712 /* wz[0] = tt[0]*z[9*jj+0] + tt[3]*z[9*jj+1] + tt[6]*z[9*jj+2];
1713 wz[1] = tt[1]*z[9*jj+0] + tt[4]*z[9*jj+1] + tt[7]*z[9*jj+2];
1714 wz[2] = tt[2]*z[9*jj+0] + tt[5]*z[9*jj+1] + tt[8]*z[9*jj+2];
1715 wz[3] = tt[0]*z[9*jj+3] + tt[3]*z[9*jj+4] + tt[6]*z[9*jj+5];
1716 wz[4] = tt[1]*z[9*jj+3] + tt[4]*z[9*jj+4] + tt[7]*z[9*jj+5];
1717 wz[5] = tt[2]*z[9*jj+3] + tt[5]*z[9*jj+4] + tt[8]*z[9*jj+5];
1718 wz[6] = tt[0]*z[9*jj+6] + tt[3]*z[9*jj+7] + tt[6]*z[9*jj+8];
1719 wz[7] = tt[1]*z[9*jj+6] + tt[4]*z[9*jj+7] + tt[7]*z[9*jj+8];
1720 wz[8] = tt[2]*z[9*jj+6] + tt[5]*z[9*jj+7] + tt[8]*z[9*jj+8];
1721 */
1722 /* wz[0] = z[9*jj+0];
1723 wz[1] = z[9*jj+1];
1724 wz[2] = z[9*jj+2];
1725 wz[3] = z[9*jj+3];
1726 wz[4] = z[9*jj+4];
1727 wz[5] = z[9*jj+5];
1728 wz[6] = z[9*jj+6];
1729 wz[7] = z[9*jj+7];
1730 wz[8] = z[9*jj+8];*/
1731
1732 D->value[9*jj+0] -= w[9*jj+0]*wz[0] + w[9*jj+3]*wz[1] + w[9*jj+6]*wz[2];
1733 D->value[9*jj+1] -= w[9*jj+1]*wz[0] + w[9*jj+4]*wz[1] + w[9*jj+7]*wz[2];
1734 D->value[9*jj+2] -= w[9*jj+2]*wz[0] + w[9*jj+5]*wz[1] + w[9*jj+8]*wz[2];
1735 D->value[9*jj+3] -= w[9*jj+0]*wz[3] + w[9*jj+3]*wz[4] + w[9*jj+6]*wz[5];
1736 D->value[9*jj+4] -= w[9*jj+1]*wz[3] + w[9*jj+4]*wz[4] + w[9*jj+7]*wz[5];
1737 D->value[9*jj+5] -= w[9*jj+2]*wz[3] + w[9*jj+5]*wz[4] + w[9*jj+8]*wz[5];
1738 D->value[9*jj+6] -= w[9*jj+0]*wz[6] + w[9*jj+3]*wz[7] + w[9*jj+6]*wz[8];
1739 D->value[9*jj+7] -= w[9*jj+1]*wz[6] + w[9*jj+4]*wz[7] + w[9*jj+7]*wz[8];
1740 D->value[9*jj+8] -= w[9*jj+2]*wz[6] + w[9*jj+5]*wz[7] + w[9*jj+8]*wz[8];
1741 break;
1742 }
1743 }
1744 }
1745 }
1746 #endif
1747 /* drop U */
1748 nnz = 0;
1749 for(j=0;j<cz;j++)
1750 {
1751 jj = iz[j];
1752 switch(bnr)
1753 {
1754 case 1:
1755 t = fabs(z[jj]);
1756 break;
1757 case 2:
1758 t = fabs(z[4*jj+0]) + fabs(z[4*jj+1]);
1759 t += fabs(z[4*jj+2]) + fabs(z[4*jj+3]);
1760 /* t = sqrt(t);*/
1761 break;
1762 case 3:
1763 t = fabs(z[9*jj+0]) + fabs(z[9*jj+1]) + fabs(z[9*jj+2]);
1764 t += fabs(z[9*jj+3]) + fabs(z[9*jj+4]) + fabs(z[9*jj+5]);
1765 t += fabs(z[9*jj+6]) + fabs(z[9*jj+7]) + fabs(z[9*jj+8]);
1766 /* t = sqrt(t);*/
1767 break;
1768 }
1769 if( t>toldd )
1770 {
1771 iz[nnz++] = jj;
1772 tmp[jj] = t;
1773 }
1774 else
1775 {
1776 switch(bnr)
1777 {
1778 case 1:
1779 z[k] = z[k] + fabs(z[jj]);
1780 break;
1781 case 2:
1782 z[4*k+0] += fabs(z[4*jj+0]);
1783 z[4*k+1] += fabs(z[4*jj+1]);
1784 z[4*k+1] += fabs(z[4*jj+2]);
1785 z[4*k+1] += fabs(z[4*jj+3]);
1786 break;
1787 case 3:
1788 z[9*k+0] += fabs(z[9*jj+0]);
1789 z[9*k+1] += fabs(z[9*jj+1]);
1790 z[9*k+2] += fabs(z[9*jj+2]);
1791 z[9*k+3] += fabs(z[9*jj+3]);
1792 z[9*k+4] += fabs(z[9*jj+4]);
1793 z[9*k+5] += fabs(z[9*jj+5]);
1794 z[9*k+6] += fabs(z[9*jj+6]);
1795 z[9*k+7] += fabs(z[9*jj+7]);
1796 z[9*k+8] += fabs(z[9*jj+8]);
1797 break;
1798 }
1799 zc[jj] = 0;
1800 tmp[jj] = 0.0;
1801 }
1802 }
1803 len = _min(lfil,nnz);
1804 lis_sort_di(0,nnz-1,tmp,iz);
1805 lis_sort_i(0,len-1,iz);
1806 U->nnz[k] = len;
1807 if( len>0 )
1808 {
1809 U->index[k] = (LIS_INT *)malloc(len*sizeof(LIS_INT));
1810 U->value[k] = (LIS_SCALAR *)malloc(bs*len*sizeof(LIS_SCALAR));
1811 }
1812 for(j=0;j<len;j++)
1813 {
1814 jj = iz[j];
1815 U->index[k][j] = jj;
1816 /* U->value[k][j] = z[jj];*/
1817 memcpy(&U->value[k][bs*j],&z[bs*jj],bs*sizeof(LIS_SCALAR));
1818 /* for(i=0;i<bnr;i++)
1819 {
1820 lis_printf(comm,"U:k=%d j=%d",k,jj);
1821 for(ii=0;ii<bnr;ii++)
1822 {
1823 lis_printf(comm," %e",z[bs*jj+ii*bnr+i]);
1824 }
1825 lis_printf(comm,"\n");
1826 }*/
1827 }
1828 for(j=len;j<cz;j++) zc[iz[j]] = 0;
1829 cz = nnz;
1830 /* lis_printf(comm,"k=%d U:lfil=%d nnz=%d len=%d\n",k,lfil,nnz,len);*/
1831
1832 /* drop L */
1833 nnz = 0;
1834 for(j=0;j<cw;j++)
1835 {
1836 jj = iw[j];
1837 switch(bnr)
1838 {
1839 case 1:
1840 t = fabs(w[jj]);
1841 break;
1842 case 2:
1843 t = fabs(w[4*jj+0]) + fabs(w[4*jj+1]);
1844 t += fabs(w[4*jj+2]) + fabs(w[4*jj+3]);
1845 /* t = sqrt(t);*/
1846 break;
1847 case 3:
1848 t = fabs(w[9*jj+0]) + fabs(w[9*jj+1]) + fabs(w[9*jj+2]);
1849 t += fabs(w[9*jj+3]) + fabs(w[9*jj+4]) + fabs(w[9*jj+5]);
1850 t += fabs(w[9*jj+6]) + fabs(w[9*jj+7]) + fabs(w[9*jj+8]);
1851 /* t = sqrt(t);*/
1852 break;
1853 }
1854 /* lis_printf(comm,"k=%d L:jj=%d t=%e tol=%e\n",k,jj,t,toldd);*/
1855 if( t>toldd )
1856 {
1857 iw[nnz++] = jj;
1858 tmp[jj] = t;
1859 }
1860 else
1861 {
1862 wc[jj] = 0;
1863 tmp[jj] = 0.0;
1864 }
1865 }
1866 len = _min(lfil,nnz);
1867 lis_sort_di(0,nnz-1,tmp,iw);
1868 lis_sort_i(0,len-1,iw);
1869 L->nnz[k] = len;
1870 if( len>0 )
1871 {
1872 L->index[k] = (LIS_INT *)malloc(len*sizeof(LIS_INT));
1873 L->value[k] = (LIS_SCALAR *)malloc(bs*len*sizeof(LIS_SCALAR));
1874 }
1875 for(j=0;j<len;j++)
1876 {
1877 jj = iw[j];
1878 L->index[k][j] = jj;
1879
1880 #if 1
1881 for(i=0;i<bnr;i++)
1882 {
1883 L->value[k][bs*j+i] = -w[bs*jj+i]*lu[0];
1884 for(ii=1;ii<bnr;ii++)
1885 {
1886 a = -w[bs*jj+ii*bnr+i];
1887 for(kk=0;kk<ii-1;kk++)
1888 {
1889 a -= L->value[k][bs*j+kk*bnr+i] * lu[ii*bnr+kk];
1890 }
1891 L->value[k][bs*j+ii*bnr+i] = a * lu[ii*bnr+ii];
1892 }
1893 }
1894 for(i=0;i<bnr;i++)
1895 {
1896 for(ii=bnr-1;ii>=0;ii--)
1897 {
1898 a = L->value[k][bs*j+ii*bnr+i];
1899 for(kk=ii+1;kk<bnr;kk++)
1900 {
1901 a -= L->value[k][bs*j+kk*bnr+i] * lu[ii*bnr+kk];
1902 }
1903 L->value[k][bs*j+ii*bnr+i] = a;
1904 }
1905 }
1906 #endif
1907 #if 0
1908 switch(bnr)
1909 {
1910 case 1:
1911 L->value[k][j] = w[jj]*tt[0];
1912 break;
1913 case 2:
1914 L->value[k][4*j+0] = w[4*jj+0]*tt[0] + w[4*jj+2]*tt[1];
1915 L->value[k][4*j+1] = w[4*jj+1]*tt[0] + w[4*jj+3]*tt[1];
1916 L->value[k][4*j+2] = w[4*jj+0]*tt[2] + w[4*jj+2]*tt[3];
1917 L->value[k][4*j+3] = w[4*jj+1]*tt[2] + w[4*jj+3]*tt[3];
1918 break;
1919 case 3:
1920 L->value[k][9*j+0] = w[9*jj+0]*tt[0] + w[9*jj+3]*tt[1] + w[9*jj+6]*tt[2];
1921 L->value[k][9*j+1] = w[9*jj+1]*tt[0] + w[9*jj+4]*tt[1] + w[9*jj+7]*tt[2];
1922 L->value[k][9*j+2] = w[9*jj+2]*tt[0] + w[9*jj+5]*tt[1] + w[9*jj+8]*tt[2];
1923 L->value[k][9*j+3] = w[9*jj+0]*tt[3] + w[9*jj+3]*tt[4] + w[9*jj+6]*tt[5];
1924 L->value[k][9*j+4] = w[9*jj+1]*tt[3] + w[9*jj+4]*tt[4] + w[9*jj+7]*tt[5];
1925 L->value[k][9*j+5] = w[9*jj+2]*tt[3] + w[9*jj+5]*tt[4] + w[9*jj+8]*tt[5];
1926 L->value[k][9*j+6] = w[9*jj+0]*tt[6] + w[9*jj+3]*tt[7] + w[9*jj+6]*tt[8];
1927 L->value[k][9*j+7] = w[9*jj+1]*tt[6] + w[9*jj+4]*tt[7] + w[9*jj+7]*tt[8];
1928 L->value[k][9*j+8] = w[9*jj+2]*tt[6] + w[9*jj+5]*tt[7] + w[9*jj+8]*tt[8];
1929 break;
1930 }
1931 /* for(i=0;i<bnr;i++)
1932 {
1933 lis_printf(comm,"L:k=%d j=%d",k,jj);
1934 for(ii=0;ii<bnr;ii++)
1935 {
1936 lis_printf(comm," %e",L->value[k][bs*j+ii*bnr+i]);
1937 }
1938 lis_printf(comm,"\n");
1939 }*/
1940 #endif
1941 }
1942 for(j=len;j<cw;j++) wc[iw[j]] = 0;
1943 cw = nnz;
1944 /* lis_printf(comm,"k=%d L:lfil=%d nnz=%d len=%d\n",k,lfil,nnz,len);*/
1945
1946
1947 /**/
1948 for(j=0;j<cz;j++) zc[iz[j]] = 0;
1949 for(j=0;j<cw;j++) wc[iw[j]] = 0;
1950 if(U->nnz[k]>0)
1951 {
1952 jj = iz[0];
1953 zc[k] = 0;
1954 zl[k] = zl[jj];
1955 zl[jj] = k;
1956 }
1957 if(L->nnz[k]>0)
1958 {
1959 jj = iw[0];
1960 wc[k] = 0;
1961 wl[k] = wl[jj];
1962 wl[jj] = k;
1963 }
1964 }
1965
1966 precon->L = L;
1967 precon->U = U;
1968 precon->WD = D;
1969
1970 lis_free2(12,tmp,w,z,iw,wc,wl,iz,zc,zl,ptr,index,value);
1971
1972 LIS_DEBUG_FUNC_OUT;
1973 return LIS_SUCCESS;
1974 }
1975
1976
1977
1978 #undef __FUNC__
1979 #define __FUNC__ "lis_psolve_iluc"
lis_psolve_iluc(LIS_SOLVER solver,LIS_VECTOR B,LIS_VECTOR X)1980 LIS_INT lis_psolve_iluc(LIS_SOLVER solver, LIS_VECTOR B, LIS_VECTOR X)
1981 {
1982 #ifdef _OPENMP
1983 LIS_INT i,j,jj,n;
1984 LIS_INT nprocs,my_rank,is,ie;
1985 LIS_SCALAR w;
1986 LIS_SCALAR *x;
1987 LIS_MATRIX_ILU L,U;
1988 LIS_VECTOR D;
1989 LIS_PRECON precon;
1990 LIS_QUAD_DECLAR;
1991 #ifdef USE_QUAD_PRECISION
1992 LIS_QUAD w1;
1993 LIS_SCALAR *xl;
1994 #endif
1995
1996
1997 LIS_DEBUG_FUNC_IN;
1998
1999 precon = solver->precon;
2000 L = precon->L;
2001 U = precon->U;
2002 D = precon->D;
2003 n = L->n;
2004 x = X->value;
2005 #ifdef USE_QUAD_PRECISION
2006 xl = X->value_lo;
2007 #endif
2008
2009 #ifdef USE_QUAD_PRECISION
2010 if( B->precision==LIS_PRECISION_DEFAULT )
2011 {
2012 #endif
2013 lis_vector_copy(B,X);
2014 nprocs = omp_get_max_threads();
2015 #pragma omp parallel private(i,j,jj,is,ie,my_rank,w)
2016 {
2017 my_rank = omp_get_thread_num();
2018 LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
2019
2020 for(i=is; i<ie; i++)
2021 {
2022 for(j=0;j<L->nnz[i];j++)
2023 {
2024 jj = L->index[i][j];
2025 x[jj] -= L->value[i][j] * x[i];
2026 }
2027 }
2028 for(i=ie-1; i>=is; i--)
2029 {
2030 w = x[i];
2031 for(j=0;j<U->nnz[i];j++)
2032 {
2033 jj = U->index[i][j];
2034 w -= U->value[i][j] * x[jj];
2035 }
2036 x[i] = w * D->value[i];
2037 }
2038 }
2039 #ifdef USE_QUAD_PRECISION
2040 }
2041 else
2042 {
2043 lis_vector_copyex_mm(B,X);
2044 nprocs = omp_get_max_threads();
2045 #ifndef USE_SSE2
2046 #pragma omp parallel private(i,j,jj,is,ie,my_rank,w1,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
2047 #else
2048 #pragma omp parallel private(i,j,jj,is,ie,my_rank,w1,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
2049 #endif
2050 {
2051 my_rank = omp_get_thread_num();
2052 LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
2053 for(i=is; i<ie; i++)
2054 {
2055 for(j=0;j<L->nnz[i];j++)
2056 {
2057 jj = L->index[i][j];
2058 /* if( jj>=is && jj<ie )*/
2059 {
2060 #ifndef USE_SSE2
2061 LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-L->value[i][j]);
2062 #else
2063 LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-L->value[i][j]);
2064 #endif
2065 }
2066 }
2067 }
2068 for(i=ie-1; i>=is; i--)
2069 {
2070 w1.hi = x[i];
2071 w1.lo = xl[i];
2072 for(j=0;j<U->nnz[i];j++)
2073 {
2074 jj = U->index[i][j];
2075 #ifndef USE_SSE2
2076 LIS_QUAD_FMAD(w1.hi,w1.lo,w1.hi,w1.lo,x[jj],xl[jj],-U->value[i][j]);
2077 #else
2078 LIS_QUAD_FMAD_SSE2(w1.hi,w1.lo,w1.hi,w1.lo,x[jj],xl[jj],-U->value[i][j]);
2079 #endif
2080 }
2081 #ifndef USE_SSE2
2082 LIS_QUAD_MULD(x[i],xl[i],w1.hi,w1.lo,D->value[i]);
2083 #else
2084 LIS_QUAD_MULD_SSE2(x[i],xl[i],w1.hi,w1.lo,D->value[i]);
2085 #endif
2086 }
2087 }
2088 }
2089 #endif
2090
2091 LIS_DEBUG_FUNC_OUT;
2092 return LIS_SUCCESS;
2093
2094 #else
2095 LIS_INT i,j,jj,n;
2096 LIS_SCALAR w;
2097 LIS_SCALAR *b,*x;
2098 LIS_MATRIX_ILU L,U;
2099 LIS_VECTOR D;
2100 LIS_PRECON precon;
2101 LIS_QUAD_DECLAR;
2102 #ifdef USE_QUAD_PRECISION
2103 LIS_QUAD w1;
2104 LIS_SCALAR *xl;
2105 #endif
2106
2107 /*
2108 * LUx = b
2109 * LU = (D + L*A) * (I + D^-1 * U*A)
2110 */
2111
2112 LIS_DEBUG_FUNC_IN;
2113
2114 precon = solver->precon;
2115 L = precon->L;
2116 U = precon->U;
2117 D = precon->D;
2118 n = L->n;
2119 b = B->value;
2120 x = X->value;
2121 #ifdef USE_QUAD_PRECISION
2122 xl = X->value_lo;
2123 #endif
2124
2125 #ifdef USE_QUAD_PRECISION
2126 if( B->precision==LIS_PRECISION_DEFAULT )
2127 {
2128 #endif
2129 lis_vector_copy(B,X);
2130 for(i=0; i<n; i++)
2131 {
2132 for(j=0;j<L->nnz[i];j++)
2133 {
2134 jj = L->index[i][j];
2135 x[jj] -= L->value[i][j] * x[i];
2136 }
2137 /* x[i] = x[i] * L->work[i];*/
2138 }
2139 for(i=n-1; i>=0; i--)
2140 {
2141 w = x[i];
2142 for(j=0;j<U->nnz[i];j++)
2143 {
2144 jj = U->index[i][j];
2145 w -= U->value[i][j] * x[jj];
2146 }
2147 x[i] = w * D->value[i];
2148 }
2149 #ifdef USE_QUAD_PRECISION
2150 }
2151 else
2152 {
2153 lis_vector_copyex_mm(B,X);
2154 for(i=0; i<n; i++)
2155 {
2156 for(j=0;j<L->nnz[i];j++)
2157 {
2158 jj = L->index[i][j];
2159 #ifndef USE_SSE2
2160 LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-L->value[i][j]);
2161 #else
2162 LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-L->value[i][j]);
2163 #endif
2164 /* x[jj] -= L->value[i][j] * x[i]; */
2165 }
2166 }
2167 for(i=n-1; i>=0; i--)
2168 {
2169 w1.hi = x[i];
2170 w1.lo = xl[i];
2171 /* t = x[i]; */
2172 for(j=0;j<U->nnz[i];j++)
2173 {
2174 jj = U->index[i][j];
2175 #ifndef USE_SSE2
2176 LIS_QUAD_FMAD(w1.hi,w1.lo,w1.hi,w1.lo,x[jj],xl[jj],-U->value[i][j]);
2177 #else
2178 LIS_QUAD_FMAD_SSE2(w1.hi,w1.lo,w1.hi,w1.lo,x[jj],xl[jj],-U->value[i][j]);
2179 #endif
2180 /* t -= U->value[i][j] * x[jj]; */
2181 }
2182 #ifndef USE_SSE2
2183 LIS_QUAD_MULD(x[i],xl[i],w1.hi,w1.lo,D->value[i]);
2184 #else
2185 LIS_QUAD_MULD_SSE2(x[i],xl[i],w1.hi,w1.lo,D->value[i]);
2186 #endif
2187 /* x[i] = t * D->value[i]; */
2188 }
2189 }
2190 #endif
2191
2192 LIS_DEBUG_FUNC_OUT;
2193 return LIS_SUCCESS;
2194
2195 #endif
2196 }
2197
2198 #undef __FUNC__
2199 #define __FUNC__ "lis_psolve_iluc_bsr"
lis_psolve_iluc_bsr(LIS_SOLVER solver,LIS_VECTOR B,LIS_VECTOR X)2200 LIS_INT lis_psolve_iluc_bsr(LIS_SOLVER solver, LIS_VECTOR B, LIS_VECTOR X)
2201 {
2202 LIS_INT i,j,jj,nr,bnr,ii,bs;
2203 LIS_SCALAR w[9],t;
2204 LIS_SCALAR *x;
2205 LIS_MATRIX_ILU L,U;
2206 LIS_MATRIX_DIAG D;
2207 LIS_PRECON precon;
2208
2209 /*
2210 * LUx = b
2211 * LU = (D + L*A) * (I + D^-1 * U*A)
2212 */
2213
2214 LIS_DEBUG_FUNC_IN;
2215
2216 precon = solver->precon;
2217 L = precon->L;
2218 U = precon->U;
2219 D = precon->WD;
2220 x = X->value;
2221 nr = solver->A->nr;
2222 bnr = solver->A->bnr;
2223 bs = bnr*bnr;
2224
2225 lis_vector_copy(B,X);
2226 for(i=0; i<nr; i++)
2227 {
2228 for(j=0;j<L->nnz[i];j++)
2229 {
2230 jj = L->index[i][j];
2231 /* memcpy(w,&x[bnr*i],bnr*sizeof(LIS_SCALAR));*/
2232 switch(bnr)
2233 {
2234 case 1:
2235 x[jj] -= L->value[i][j] * x[i];
2236 break;
2237 case 2:
2238 x[2*jj+0] -= L->value[i][4*j] * x[2*i+0];
2239 x[2*jj+0] -= L->value[i][4*j+2] * x[2*i+1];
2240 x[2*jj+1] -= L->value[i][4*j+1] * x[2*i+0];
2241 x[2*jj+1] -= L->value[i][4*j+3] * x[2*i+1];
2242 break;
2243 case 3:
2244 x[3*jj] -= L->value[i][9*j] * x[3*i+0] + L->value[i][9*j+3] * x[3*i+1] + L->value[i][9*j+6] * x[3*i+2];
2245 x[3*jj+1] -= L->value[i][9*j+1] * x[3*i+0] + L->value[i][9*j+4] * x[3*i+1] + L->value[i][9*j+7] * x[3*i+2];
2246 x[3*jj+2] -= L->value[i][9*j+2] * x[3*i+0] + L->value[i][9*j+5] * x[3*i+1] + L->value[i][9*j+8] * x[3*i+2];
2247 break;
2248 }
2249 }
2250 }
2251 for(i=nr-1; i>=0; i--)
2252 {
2253 /* w = x[i];*/
2254 memcpy(w,&x[bnr*i],bnr*sizeof(LIS_SCALAR));
2255 for(j=0;j<U->nnz[i];j++)
2256 {
2257 jj = U->index[i][j];
2258 switch(bnr)
2259 {
2260 case 1:
2261 w[0] -= U->value[i][j] * x[jj];
2262 break;
2263 case 2:
2264 w[0] -= U->value[i][4*j] * x[2*jj];
2265 w[0] -= U->value[i][4*j+2] * x[2*jj+1];
2266 w[1] -= U->value[i][4*j+1] * x[2*jj];
2267 w[1] -= U->value[i][4*j+3] * x[2*jj+1];
2268 break;
2269 case 3:
2270 w[0] -= U->value[i][9*j] * x[3*jj] + U->value[i][9*j+3] * x[3*jj+1] + U->value[i][9*j+6] * x[3*jj+2];
2271 w[1] -= U->value[i][9*j+1] * x[3*jj] + U->value[i][9*j+4] * x[3*jj+1] + U->value[i][9*j+7] * x[3*jj+2];
2272 w[2] -= U->value[i][9*j+2] * x[3*jj] + U->value[i][9*j+5] * x[3*jj+1] + U->value[i][9*j+8] * x[3*jj+2];
2273 break;
2274 }
2275 }
2276 #if 0
2277 switch(bnr)
2278 {
2279 case 1:
2280 x[i] = w[0] * D->value[i];
2281 break;
2282 case 2:
2283 x[2*i+0] = D->value[4*i] * w[0] + D->value[4*i+2] * w[1];
2284 x[2*i+1] = D->value[4*i+1] * w[0] + D->value[4*i+3] * w[1];
2285 break;
2286 case 3:
2287 x[3*i+0] = D->value[9*i] * w[0] + D->value[9*i+3] * w[1] + D->value[9*i+6] * w[2];
2288 x[3*i+1] = D->value[9*i+1] * w[0] + D->value[9*i+4] * w[1] + D->value[9*i+7] * w[2];
2289 x[3*i+2] = D->value[9*i+2] * w[0] + D->value[9*i+5] * w[1] + D->value[9*i+8] * w[2];
2290 break;
2291 }
2292 #else
2293 for(ii=0;ii<bnr;ii++)
2294 {
2295 t = w[ii];
2296 for(jj=0;jj<ii;jj++)
2297 {
2298 t -= D->value[bs*i+jj*bnr+ii] * x[bnr*i+jj];
2299 }
2300 x[bnr*i+ii] = t;
2301 }
2302 for(ii=bnr-1;ii>=0;ii--)
2303 {
2304 t = x[bnr*i+ii];
2305 for(jj=ii+1;jj<bnr;jj++)
2306 {
2307 t -= D->value[bs*i+jj*bnr+ii] * x[bnr*i+jj];
2308 }
2309 x[bnr*i+ii] = t * D->value[bs*i+ii*bnr+ii];
2310 }
2311 #endif
2312 }
2313
2314 LIS_DEBUG_FUNC_OUT;
2315 return LIS_SUCCESS;
2316 }
2317
2318 #undef __FUNC__
2319 #define __FUNC__ "lis_psolveh_iluc"
lis_psolveh_iluc(LIS_SOLVER solver,LIS_VECTOR B,LIS_VECTOR X)2320 LIS_INT lis_psolveh_iluc(LIS_SOLVER solver, LIS_VECTOR B, LIS_VECTOR X)
2321 {
2322 #ifdef _OPENMP
2323 LIS_INT i,j,jj,n,is,ie,my_rank,nprocs;
2324 LIS_SCALAR w;
2325 LIS_SCALAR *x;
2326 LIS_MATRIX_ILU L,U;
2327 LIS_VECTOR D;
2328 LIS_PRECON precon;
2329 LIS_QUAD_DECLAR;
2330 #ifdef USE_QUAD_PRECISION
2331 LIS_QUAD w1;
2332 LIS_SCALAR *xl;
2333 #endif
2334
2335 /*
2336 * (LU)'x = b
2337 * U'x=b
2338 * L'x=b
2339 */
2340
2341
2342 LIS_DEBUG_FUNC_IN;
2343
2344 precon = solver->precon;
2345 L = precon->L;
2346 U = precon->U;
2347 D = precon->D;
2348 n = L->n;
2349 x = X->value;
2350 #ifdef USE_QUAD_PRECISION
2351 xl = X->value_lo;
2352 #endif
2353
2354 #ifdef USE_QUAD_PRECISION
2355 if( B->precision==LIS_PRECISION_DEFAULT )
2356 {
2357 #endif
2358 lis_vector_copy(B,X);
2359 nprocs = omp_get_max_threads();
2360 #pragma omp parallel private(i,j,jj,is,ie,my_rank,w)
2361 {
2362 my_rank = omp_get_thread_num();
2363 LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
2364 for(i=is; i<ie; i++)
2365 {
2366 x[i] = x[i] * D->value[i];
2367 for(j=0;j<U->nnz[i];j++)
2368 {
2369 jj = U->index[i][j];
2370 x[jj] -= conj(U->value[i][j]) * x[i];
2371 }
2372 }
2373 for(i=ie-2; i>=is; i--)
2374 {
2375 w = x[i];
2376 for(j=0;j<L->nnz[i];j++)
2377 {
2378 jj = L->index[i][j];
2379 w -= conj(L->value[i][j]) * x[jj];
2380 }
2381 x[i] = w;
2382 }
2383 }
2384 #ifdef USE_QUAD_PRECISION
2385 }
2386 else
2387 {
2388 lis_vector_copyex_mm(B,X);
2389 nprocs = omp_get_max_threads();
2390 #ifndef USE_SSE2
2391 #pragma omp parallel private(i,j,jj,is,ie,my_rank,w1,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
2392 #else
2393 #pragma omp parallel private(i,j,jj,is,ie,my_rank,w1,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
2394 #endif
2395 {
2396 my_rank = omp_get_thread_num();
2397 LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
2398 for(i=is; i<ie; i++)
2399 {
2400 #ifndef USE_SSE2
2401 LIS_QUAD_MULD(x[i],xl[i],x[i],xl[i],D->value[i]);
2402 #else
2403 LIS_QUAD_MULD_SSE2(x[i],xl[i],x[i],xl[i],conj(D->value[i]));
2404 #endif
2405 /* x[i] = x[i] * conj(D->value[i]); */
2406 for(j=0;j<U->nnz[i];j++)
2407 {
2408 jj = U->index[i][j];
2409 #ifndef USE_SSE2
2410 LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-conj(U->value[i][j]));
2411 #else
2412 LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-conj(U->value[i][j]));
2413 #endif
2414 /* x[jj] -= conj(U->value[i][j]) * x[i]; */
2415 }
2416 }
2417 for(i=ie-2; i>=is; i--)
2418 {
2419 w1.hi = x[i];
2420 w1.lo = xl[i];
2421 /* t = x[i]; */
2422 for(j=0;j<L->nnz[i];j++)
2423 {
2424 jj = L->index[i][j];
2425 #ifndef USE_SSE2
2426 LIS_QUAD_FMAD(w1.hi,w1.lo,w1.hi,w1.lo,x[jj],xl[jj],-conj(L->value[i][j]));
2427 #else
2428 LIS_QUAD_FMAD_SSE2(w1.hi,w1.lo,w1.hi,w1.lo,x[jj],xl[jj],-conj(L->value[i][j]));
2429 #endif
2430 /* t -= conj(L->value[i][j]) * x[jj]; */
2431 }
2432 x[i] = w1.hi;
2433 xl[i] = w1.lo;
2434 /* x[i] = t; */
2435 }
2436 }
2437 }
2438 #endif
2439
2440 LIS_DEBUG_FUNC_OUT;
2441 return LIS_SUCCESS;
2442 #else
2443 LIS_INT i,j,jj,n;
2444 LIS_SCALAR w;
2445 LIS_SCALAR *b,*x;
2446 LIS_MATRIX_ILU L,U;
2447 LIS_VECTOR D;
2448 LIS_PRECON precon;
2449 LIS_QUAD_DECLAR;
2450 #ifdef USE_QUAD_PRECISION
2451 LIS_QUAD w1;
2452 LIS_SCALAR *xl;
2453 #endif
2454
2455 /*
2456 * (LU)'x = b
2457 * U'x=b
2458 * L'x=b
2459 */
2460
2461
2462 LIS_DEBUG_FUNC_IN;
2463
2464 precon = solver->precon;
2465 L = precon->L;
2466 U = precon->U;
2467 D = precon->D;
2468 n = L->n;
2469 b = B->value;
2470 x = X->value;
2471 #ifdef USE_QUAD_PRECISION
2472 xl = X->value_lo;
2473 #endif
2474
2475 #ifdef USE_QUAD_PRECISION
2476 if( B->precision==LIS_PRECISION_DEFAULT )
2477 {
2478 #endif
2479 lis_vector_copy(B,X);
2480 for(i=0; i<n; i++)
2481 {
2482 x[i] = x[i] * D->value[i];
2483 for(j=0;j<U->nnz[i];j++)
2484 {
2485 jj = U->index[i][j];
2486 x[jj] -= conj(U->value[i][j]) * x[i];
2487 }
2488 }
2489 for(i=n-2; i>=0; i--)
2490 {
2491 w = x[i];
2492 for(j=0;j<L->nnz[i];j++)
2493 {
2494 jj = L->index[i][j];
2495 w -= conj(L->value[i][j]) * x[jj];
2496 }
2497 x[i] = w;
2498 }
2499 #ifdef USE_QUAD_PRECISION
2500 }
2501 else
2502 {
2503 lis_vector_copyex_mm(B,X);
2504 for(i=0; i<n; i++)
2505 {
2506 #ifndef USE_SSE2
2507 LIS_QUAD_MULD(x[i],xl[i],x[i],xl[i],D->value[i]);
2508 #else
2509 LIS_QUAD_MULD_SSE2(x[i],xl[i],x[i],xl[i],conj(D->value[i]));
2510 #endif
2511 /* x[i] = x[i] * conj(D->value[i]); */
2512 for(j=0;j<U->nnz[i];j++)
2513 {
2514 jj = U->index[i][j];
2515 #ifndef USE_SSE2
2516 LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-conj(U->value[i][j]));
2517 #else
2518 LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-conj(U->value[i][j]));
2519 #endif
2520 /* x[jj] -= conj(U->value[i][j]) * x[i]; */
2521 }
2522 }
2523 for(i=n-2; i>=0; i--)
2524 {
2525 w1.hi = x[i];
2526 w1.lo = xl[i];
2527 /* t = x[i]; */
2528 for(j=0;j<L->nnz[i];j++)
2529 {
2530 jj = L->index[i][j];
2531 #ifndef USE_SSE2
2532 LIS_QUAD_FMAD(w1.hi,w1.lo,w1.hi,w1.lo,x[jj],xl[jj],-conj(L->value[i][j]));
2533 #else
2534 LIS_QUAD_FMAD_SSE2(w1.hi,w1.lo,w1.hi,w1.lo,x[jj],xl[jj],-conj(L->value[i][j]));
2535 #endif
2536 /* t -= conj(L->value[i][j]) * x[jj]; */
2537 }
2538 x[i] = w1.hi;
2539 xl[i] = w1.lo;
2540 /* x[i] = t; */
2541 }
2542 }
2543 #endif
2544
2545 LIS_DEBUG_FUNC_OUT;
2546 return LIS_SUCCESS;
2547 #endif
2548 }
2549
2550