1 /* Copyright (C) 2005 The Scalable Software Infrastructure Project. All rights reserved.
2 
3    Redistribution and use in source and binary forms, with or without
4    modification, are permitted provided that the following conditions are met:
5    1. Redistributions of source code must retain the above copyright
6       notice, this list of conditions and the following disclaimer.
7    2. Redistributions in binary form must reproduce the above copyright
8       notice, this list of conditions and the following disclaimer in the
9       documentation and/or other materials provided with the distribution.
10    3. Neither the name of the project nor the names of its contributors
11       may be used to endorse or promote products derived from this software
12       without specific prior written permission.
13 
14    THIS SOFTWARE IS PROVIDED BY THE SCALABLE SOFTWARE INFRASTRUCTURE PROJECT
15    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
16    TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17    PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE SCALABLE SOFTWARE INFRASTRUCTURE
18    PROJECT BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
19    OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
20    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
21    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
22    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
24    POSSIBILITY OF SUCH DAMAGE.
25 */
26 
27 #ifdef HAVE_CONFIG_H
28 	#include "lis_config.h"
29 #else
30 #ifdef HAVE_CONFIG_WIN_H
31 	#include "lis_config_win.h"
32 #endif
33 #endif
34 
35 #include <stdio.h>
36 #include <stdlib.h>
37 #ifdef HAVE_MALLOC_H
38         #include <malloc.h>
39 #endif
40 #include <string.h>
41 #include <stdarg.h>
42 #ifdef _OPENMP
43 	#include <omp.h>
44 #endif
45 #ifdef USE_MPI
46 	#include <mpi.h>
47 #endif
48 #include "lislib.h"
49 
50 /********************************************
51  * Preconditioned BiConjugate Gradient Safe *
52  ********************************************
53  r(0)    = b - Ax(0)
54  rtld(0) = conj(r(0)) or random
55  mr(0)   = M^-1 * r(0)
56  amr(0)  = A * mr(0)
57  p(0)    = mr(0)
58  ap(0)   = amr(0)
59  rho(0)  = <rtld,r(0)>
60  ********************************************
61  for k=1,2,...
62    tmpdot0 = <ap(k-1),rtld(0)>
63    alpha   = rho(k-1) / tmpdot0
64    tmpdot0 = <y(k-1),y(k-1)>
65    tmpdot1 = <amr(k-1),r(k-1)>
66    tmpdot2 = <y(k-1),r(k-1)>
67    tmpdot3 = <amr(k-1),y(k-1)>
68    tmpdot4 = <amr(k-1),amr(k-1)>
69    tmp     = tmpdot4*tmpdot0-tmpdot3*tmpdot3
70    qsi     = (tmpdot0*tmpdot1-tmpdot2*tmpdot3) / tmp
71    eta     = (tmpdot4*tmpdot2-tmpdot3*tmpdot1) / tmp
72    t(k-1)  = qsi*ap(k-1) + eta*y(k-1)
73    mt(k-1) = M^-1 * t(k-1)
74    u(k-1)  = mt(k-1) + eta*beta*u(k-2)
75    au(k-1) = A * u(k-1)
76    z(k-1)  = qsi*mr(k-1) + eta*z(k-2) - alpha*u(k-1)
77    y(k)    = qsi*amr(k-1) + eta*y(k-1) - alpha*au(k-1)
78    x(k)    = x(k-1) + alpha*p(k-1) + z(k-1)
79    r(k)    = r(k-1) - alpha*ap(k-1) - y(k)
80    mr(k)   = M^-1 * r(k)
81    amr(k)  = A * mr(k)
82    rho(k)  = <rtld,r(k)>
83    beta    = (rho(k) / rho(k-1)) * (alpha / qsi)
84    p(k)    = mr(k) + beta*(p(k-1) - u(k-1))
85    ap(k-1) = amr(k) + beta*(ap(k-1) - au(k-1))
86  ********************************************/
87 
88 #define NWORK 12
89 #undef __FUNC__
90 #define __FUNC__ "lis_bicgsafe_check_params"
lis_bicgsafe_check_params(LIS_SOLVER solver)91 LIS_INT lis_bicgsafe_check_params(LIS_SOLVER solver)
92 {
93 	LIS_DEBUG_FUNC_IN;
94 	LIS_DEBUG_FUNC_OUT;
95 	return LIS_SUCCESS;
96 }
97 
98 #undef __FUNC__
99 #define __FUNC__ "lis_bicgsafe_malloc_work"
lis_bicgsafe_malloc_work(LIS_SOLVER solver)100 LIS_INT lis_bicgsafe_malloc_work(LIS_SOLVER solver)
101 {
102 	LIS_VECTOR *work;
103 	LIS_INT	i,j,worklen,err;
104 
105 	LIS_DEBUG_FUNC_IN;
106 
107 	worklen = NWORK;
108 	work    = (LIS_VECTOR *)lis_malloc( worklen*sizeof(LIS_VECTOR),"lis_bicgsafe_malloc_work::work" );
109 	if( work==NULL )
110 	{
111 		LIS_SETERR_MEM(worklen*sizeof(LIS_VECTOR));
112 		return LIS_ERR_OUT_OF_MEMORY;
113 	}
114 	if( solver->precision==LIS_PRECISION_DEFAULT )
115 	{
116 		for(i=0;i<worklen;i++)
117 		{
118 			err = lis_vector_duplicate(solver->A,&work[i]);
119 			if( err ) break;
120 		}
121 	}
122 	else
123 	{
124 		for(i=0;i<worklen;i++)
125 		{
126 			err = lis_vector_duplicateex(LIS_PRECISION_QUAD,solver->A,&work[i]);
127 			if( err ) break;
128 		}
129 	}
130 	if( i<worklen )
131 	{
132 		for(j=0;j<i;j++) lis_vector_destroy(work[j]);
133 		lis_free(work);
134 		return err;
135 	}
136 	solver->worklen = worklen;
137 	solver->work    = work;
138 
139 	LIS_DEBUG_FUNC_OUT;
140 	return LIS_SUCCESS;
141 }
142 
143 #undef __FUNC__
144 #define __FUNC__ "lis_bicgsafe"
lis_bicgsafe(LIS_SOLVER solver)145 LIS_INT lis_bicgsafe(LIS_SOLVER solver)
146 {
147 	LIS_Comm comm;
148 	LIS_MATRIX A;
149 	LIS_VECTOR x;
150 	LIS_VECTOR r, rtld, mr, amr, t, mt, p, ap;
151 	LIS_VECTOR y, u, au, z;
152 	LIS_SCALAR alpha, beta;
153 	LIS_SCALAR rho, rho_old;
154 	LIS_SCALAR qsi, eta;
155 	LIS_SCALAR tmp, tmpdot[5];
156 	LIS_REAL bnrm2, nrm2, tol;
157 	LIS_INT iter,maxiter,output,conv;
158 	double time,ptime;
159 
160 	LIS_DEBUG_FUNC_IN;
161 
162 	comm = LIS_COMM_WORLD;
163 
164 	A       = solver->A;
165 	x       = solver->x;
166 	maxiter = solver->options[LIS_OPTIONS_MAXITER];
167 	output  = solver->options[LIS_OPTIONS_OUTPUT];
168 	conv    = solver->options[LIS_OPTIONS_CONV_COND];
169 	ptime   = 0.0;
170 
171 	rtld    = solver->work[0];
172 	r       = solver->work[1];
173 	mr      = solver->work[2];
174 	amr     = solver->work[3];
175 	p       = solver->work[4];
176 	ap      = solver->work[5];
177 	t       = solver->work[6];
178 	mt      = solver->work[7];
179 	y       = solver->work[8];
180 	u       = solver->work[9];
181 	z       = solver->work[10];
182 	au      = solver->work[11];
183 
184 
185 	/* Initial Residual */
186 	if( lis_solver_get_initial_residual(solver,NULL,NULL,r,&bnrm2) )
187 	{
188 		LIS_DEBUG_FUNC_OUT;
189 		return LIS_SUCCESS;
190 	}
191 	tol     = solver->tol;
192 
193 	lis_solver_set_shadowresidual(solver,r,rtld);
194 
195 	time = lis_wtime();
196 	lis_psolve(solver, r, mr);
197 	ptime += lis_wtime()-time;
198 	lis_matvec(A,mr,amr);
199 	lis_vector_dot(rtld,r,&rho_old);
200 	lis_vector_copy(amr,ap);
201 	lis_vector_copy(mr,p);
202 	beta = 0.0;
203 
204 
205 	for( iter=1; iter<=maxiter; iter++ )
206 	{
207 		/* tmpdot[0] = <rtld,ap> */
208 		/* alpha = rho_old / tmpdot[0] */
209 		lis_vector_dot(rtld,ap,&tmpdot[0]);
210 		alpha = rho_old / tmpdot[0];
211 
212 
213 		/* tmpdot[0] = <y,y>           */
214 		/* tmpdot[1] = <amr,r>         */
215 		/* tmpdot[2] = <y,r>           */
216 		/* tmpdot[3] = <amr,y>         */
217 		/* tmpdot[4] = <amr,amr>       */
218 		lis_vector_dot(y,y,&tmpdot[0]);
219 		lis_vector_dot(amr,r,&tmpdot[1]);
220 		lis_vector_dot(y,r,&tmpdot[2]);
221 		lis_vector_dot(amr,y,&tmpdot[3]);
222 		lis_vector_dot(amr,amr,&tmpdot[4]);
223 		if(iter==1)
224 		{
225 			qsi = tmpdot[1] / tmpdot[4];
226 			eta = 0.0;
227 		}
228 		else
229 		{
230 			tmp = tmpdot[4]*tmpdot[0] - tmpdot[3]*tmpdot[3];
231 			qsi = (tmpdot[0]*tmpdot[1] - tmpdot[2]*tmpdot[3]) / tmp;
232 			eta = (tmpdot[4]*tmpdot[2] - tmpdot[3]*tmpdot[1]) / tmp;
233 		}
234 
235 		/* t = qsi*ap + eta*y */
236 		lis_vector_copy(y,t);
237 		lis_vector_scale(eta,t);
238 		lis_vector_axpy(qsi,ap,t);
239 
240 		/* mt  = M^-1 * t */
241 		time = lis_wtime();
242 		lis_psolve(solver, t, mt);
243 		ptime += lis_wtime()-time;
244 
245 		/* u    = mt + eta*beta*u */
246 		/* au = A * u             */
247 		lis_vector_xpay(mt,eta*beta,u);
248 		lis_matvec(A,u,au);
249 
250 		/* z = qsi*mr + eta*z - alpha*u */
251 		lis_vector_scale(eta,z);
252 		lis_vector_axpy(qsi,mr,z);
253 		lis_vector_axpy(-alpha,u,z);
254 
255 		/* y = qsi*amr + eta*y - alpha*au */
256 		lis_vector_scale(eta,y);
257 		lis_vector_axpy(qsi,amr,y);
258 		lis_vector_axpy(-alpha,au,y);
259 
260 		/* x = x + alpha*p + z */
261 		lis_vector_axpy(alpha,p,x);
262 		lis_vector_axpy(1.0,z,x);
263 
264 		/* r = r - alpha*ap - y */
265 		lis_vector_axpy(-alpha,ap,r);
266 		lis_vector_axpy(-1.0,y,r);
267 
268 		/* convergence check */
269 		lis_solver_get_residual[conv](r,solver,&nrm2);
270 		if( output )
271 		{
272 			if( output & LIS_PRINT_MEM ) solver->rhistory[iter] = nrm2;
273 			if( output & LIS_PRINT_OUT ) lis_print_rhistory(comm,iter,nrm2);
274 		}
275 
276 		if( tol >= nrm2 )
277 		{
278 			solver->retcode    = LIS_SUCCESS;
279 			solver->iter       = iter;
280 			solver->resid      = nrm2;
281 			solver->ptime      = ptime;
282 			LIS_DEBUG_FUNC_OUT;
283 			return LIS_SUCCESS;
284 		}
285 
286 		/* rho = <rtld,r> */
287 		lis_vector_dot(rtld,r,&rho);
288 		if( rho==0.0 )
289 		{
290 			solver->retcode   = LIS_BREAKDOWN;
291 			solver->iter      = iter;
292 			solver->resid     = nrm2;
293 			LIS_DEBUG_FUNC_OUT;
294 			return LIS_BREAKDOWN;
295 		}
296 
297 		/* beta = (rho / rho_old) * (alpha / qsi) */
298 		beta = (rho / rho_old) * (alpha / qsi);
299 
300 		/* mr  = M^-1 * r */
301 		/* amr = A * mr   */
302 		time = lis_wtime();
303 		lis_psolve(solver, r, mr);
304 		ptime += lis_wtime()-time;
305 		lis_matvec(A,mr,amr);
306 
307 		/* p  = mr + beta*(p - u)    */
308 		/* ap = amr + beta*(ap - au) */
309 		lis_vector_axpy(-1.0,u,p);
310 		lis_vector_xpay(mr,beta,p);
311 		lis_vector_axpy(-1.0,au,ap);
312 		lis_vector_xpay(amr,beta,ap);
313 
314 		rho_old = rho;
315 	}
316 
317 	solver->retcode   = LIS_MAXITER;
318 	solver->iter      = iter;
319 	solver->resid     = nrm2;
320 	LIS_DEBUG_FUNC_OUT;
321 	return LIS_MAXITER;
322 }
323 
324 
325 
326 #ifdef USE_QUAD_PRECISION
327 #undef __FUNC__
328 #define __FUNC__ "lis_bicgsafe_quad"
lis_bicgsafe_quad(LIS_SOLVER solver)329 LIS_INT lis_bicgsafe_quad(LIS_SOLVER solver)
330 {
331 	LIS_Comm comm;
332 	LIS_MATRIX A;
333 	LIS_VECTOR x;
334 	LIS_VECTOR r, rtld, rhat, p, ptld;
335 	LIS_VECTOR t, ttld;
336 	LIS_VECTOR y, v, u, utld, z;
337 	LIS_QUAD_PTR alpha, beta, rho, rho_old;
338 	LIS_QUAD_PTR qsi, eta;
339 	LIS_QUAD_PTR tmp, tmpdot[5],one;
340 	LIS_REAL bnrm2, nrm2, tol;
341 	LIS_INT iter,maxiter,output,conv;
342 	double time,ptime;
343 
344 	LIS_DEBUG_FUNC_IN;
345 
346 	comm = LIS_COMM_WORLD;
347 
348 	A       = solver->A;
349 	x       = solver->x;
350 	maxiter = solver->options[LIS_OPTIONS_MAXITER];
351 	output  = solver->options[LIS_OPTIONS_OUTPUT];
352 	conv    = solver->options[LIS_OPTIONS_CONV_COND];
353 	ptime   = 0.0;
354 
355 
356 	rtld    = solver->work[0];
357 	r       = solver->work[1];
358 	rhat    = solver->work[2];
359 	p       = solver->work[3];
360 	ptld    = solver->work[4];
361 	t       = solver->work[5];
362 	ttld    = solver->work[6];
363 	y       = solver->work[7];
364 	v       = solver->work[8];
365 	u       = solver->work[9];
366 	z       = solver->work[10];
367 	utld    = solver->work[11];
368 
369 	LIS_QUAD_SCALAR_MALLOC(alpha,0,1);
370 	LIS_QUAD_SCALAR_MALLOC(beta,1,1);
371 	LIS_QUAD_SCALAR_MALLOC(rho,2,1);
372 	LIS_QUAD_SCALAR_MALLOC(rho_old,3,1);
373 	LIS_QUAD_SCALAR_MALLOC(qsi,4,1);
374 	LIS_QUAD_SCALAR_MALLOC(eta,5,1);
375 	LIS_QUAD_SCALAR_MALLOC(tmp,6,1);
376 	LIS_QUAD_SCALAR_MALLOC(tmpdot[0],7,1);
377 	LIS_QUAD_SCALAR_MALLOC(tmpdot[1],8,1);
378 	LIS_QUAD_SCALAR_MALLOC(tmpdot[2],9,1);
379 	LIS_QUAD_SCALAR_MALLOC(tmpdot[3],10,1);
380 	LIS_QUAD_SCALAR_MALLOC(tmpdot[4],11,1);
381 	LIS_QUAD_SCALAR_MALLOC(one,13,1);
382 
383 	rho_old.hi[0] = 1.0;
384 	rho_old.lo[0] = 0.0;
385 	alpha.hi[0] = 1.0;
386 	alpha.lo[0] = 0.0;
387 	qsi.hi[0] = 1.0;
388 	qsi.lo[0] = 0.0;
389 	one.hi[0] = -1.0;
390 	one.lo[0] = 0.0;
391 
392 
393 	/* Initial Residual */
394 	if( lis_solver_get_initial_residual(solver,NULL,NULL,r,&bnrm2) )
395 	{
396 		LIS_DEBUG_FUNC_OUT;
397 		return LIS_SUCCESS;
398 	}
399 	tol     = solver->tol;
400 
401 	lis_solver_set_shadowresidual(solver,r,rtld);
402 
403 	lis_vector_set_allex_nm(0.0,p);
404 	lis_vector_set_allex_nm(0.0,u);
405 	lis_vector_set_allex_nm(0.0,ptld);
406 	lis_vector_set_allex_nm(0.0,utld);
407 
408 	for( iter=1; iter<=maxiter; iter++ )
409 	{
410 		/* rho = <rtld,r> */
411 		lis_vector_dotex_mmm(rtld,r,&rho);
412 
413 		/* test breakdown */
414 		if( rho.hi[0]==0.0 && rho.lo[0]==0.0 )
415 		{
416 			solver->retcode   = LIS_BREAKDOWN;
417 			solver->iter      = iter;
418 			solver->resid     = nrm2;
419 			LIS_DEBUG_FUNC_OUT;
420 			return LIS_BREAKDOWN;
421 		}
422 
423 		/* beta = (rho / rho_old) * (alpha / qsi) */
424 		lis_quad_div((LIS_QUAD *)beta.hi,(LIS_QUAD *)rho.hi,(LIS_QUAD *)rho_old.hi);
425 		lis_quad_div((LIS_QUAD *)tmp.hi,(LIS_QUAD *)alpha.hi,(LIS_QUAD *)qsi.hi);
426 		lis_quad_mul((LIS_QUAD *)beta.hi,(LIS_QUAD *)beta.hi,(LIS_QUAD *)tmp.hi);
427 
428 		/* rhat = M^-1 * r */
429 		/* v    = A * rhat */
430 		time = lis_wtime();
431 		lis_psolve(solver, r, rhat);
432 		ptime += lis_wtime()-time;
433 		lis_matvec(A,rhat,v);
434 
435 		/* p = rhat + beta*(p - u) */
436 		lis_vector_axpyex_mmm(one,u,p);
437 		lis_vector_xpayex_mmm(rhat,beta,p);
438 
439 		/* ptld = v + beta*(ptld - utld) */
440 		lis_vector_axpyex_mmm(one,utld,ptld);
441 		lis_vector_xpayex_mmm(v,beta,ptld);
442 
443 		/* tmpdot[0] = <rtld,ptld> */
444 		lis_vector_dotex_mmm(rtld,ptld,&tmpdot[0]);
445 		/* test breakdown */
446 		/* */
447 
448 		/* alpha = rho / tmpdot[0] */
449 		lis_quad_div((LIS_QUAD *)alpha.hi,(LIS_QUAD *)rho.hi,(LIS_QUAD *)tmpdot[0].hi);
450 
451 
452 		/* tmpdot[0] = <y,y>       */
453 		/* tmpdot[1] = <v,r>       */
454 		/* tmpdot[2] = <y,r>       */
455 		/* tmpdot[3] = <v,y>       */
456 		/* tmpdot[4] = <v,v>       */
457 		lis_vector_dotex_mmm(y,y,&tmpdot[0]);
458 		lis_vector_dotex_mmm(v,r,&tmpdot[1]);
459 		lis_vector_dotex_mmm(y,r,&tmpdot[2]);
460 		lis_vector_dotex_mmm(v,y,&tmpdot[3]);
461 		lis_vector_dotex_mmm(v,v,&tmpdot[4]);
462 		if(iter==1)
463 		{
464 			lis_quad_div((LIS_QUAD *)qsi.hi,(LIS_QUAD *)tmpdot[1].hi,(LIS_QUAD *)tmpdot[4].hi);
465 			eta.hi[0] = 0.0;
466 			eta.lo[0] = 0.0;
467 		}
468 		else
469 		{
470 			lis_quad_mul((LIS_QUAD *)tmp.hi,(LIS_QUAD *)tmpdot[4].hi,(LIS_QUAD *)tmpdot[0].hi);
471 			lis_quad_sqr((LIS_QUAD *)qsi.hi,(LIS_QUAD *)tmpdot[3].hi);
472 			lis_quad_sub((LIS_QUAD *)tmp.hi,(LIS_QUAD *)tmp.hi,(LIS_QUAD *)qsi.hi);
473 
474 			lis_quad_mul((LIS_QUAD *)qsi.hi,(LIS_QUAD *)tmpdot[0].hi,(LIS_QUAD *)tmpdot[1].hi);
475 			lis_quad_mul((LIS_QUAD *)eta.hi,(LIS_QUAD *)tmpdot[2].hi,(LIS_QUAD *)tmpdot[3].hi);
476 			lis_quad_sub((LIS_QUAD *)qsi.hi,(LIS_QUAD *)qsi.hi,(LIS_QUAD *)eta.hi);
477 			lis_quad_div((LIS_QUAD *)qsi.hi,(LIS_QUAD *)qsi.hi,(LIS_QUAD *)tmp.hi);
478 
479 			lis_quad_mul((LIS_QUAD *)eta.hi,(LIS_QUAD *)tmpdot[4].hi,(LIS_QUAD *)tmpdot[2].hi);
480 			lis_quad_mul((LIS_QUAD *)tmpdot[0].hi,(LIS_QUAD *)tmpdot[3].hi,(LIS_QUAD *)tmpdot[1].hi);
481 			lis_quad_sub((LIS_QUAD *)eta.hi,(LIS_QUAD *)eta.hi,(LIS_QUAD *)tmpdot[0].hi);
482 			lis_quad_div((LIS_QUAD *)eta.hi,(LIS_QUAD *)eta.hi,(LIS_QUAD *)tmp.hi);
483 		}
484 
485 		/* t = qsi*ptld + eta*y */
486 		lis_vector_copyex_mm(y,t);
487 		lis_vector_scaleex_mm(eta,t);
488 		lis_vector_axpyex_mmm(qsi,ptld,t);
489 
490 		/* ttld  = M^-1 * t */
491 		time = lis_wtime();
492 		lis_psolve(solver, t, ttld);
493 		ptime += lis_wtime()-time;
494 
495 		/* u    = ttld + eta*beta*u */
496 		/* utld = A * u             */
497 		lis_quad_mul((LIS_QUAD *)tmp.hi,(LIS_QUAD *)eta.hi,(LIS_QUAD *)beta.hi);
498 		lis_vector_xpayex_mmm(ttld,tmp,u);
499 		lis_matvec(A,u,utld);
500 
501 		/* z = qsi*rhat + eta*z - alpha*u */
502 		lis_vector_scaleex_mm(eta,z);
503 		lis_vector_axpyex_mmm(qsi,rhat,z);
504 		lis_quad_minus((LIS_QUAD *)alpha.hi);
505 		lis_vector_axpyex_mmm(alpha,u,z);
506 
507 		/* y = qsi*v + eta*y - alpha*utld */
508 		lis_vector_scaleex_mm(eta,y);
509 		lis_vector_axpyex_mmm(qsi,v,y);
510 		lis_vector_axpyex_mmm(alpha,utld,y);
511 		lis_quad_minus((LIS_QUAD *)alpha.hi);
512 
513 		/* x = x + alpha*p + z */
514 		lis_vector_axpyex_mmm(alpha,p,x);
515 		lis_quad_minus((LIS_QUAD *)one.hi);
516 		lis_vector_axpyex_mmm(one,z,x);
517 		lis_quad_minus((LIS_QUAD *)one.hi);
518 
519 		/* r = r - alpha*ptld - y */
520 		lis_quad_minus((LIS_QUAD *)alpha.hi);
521 		lis_vector_axpyex_mmm(alpha,ptld,r);
522 		lis_quad_minus((LIS_QUAD *)alpha.hi);
523 		lis_vector_axpyex_mmm(one,y,r);
524 
525 		/* convergence check */
526 		lis_solver_get_residual[conv](r,solver,&nrm2);
527 		if( output )
528 		{
529 			if( output & LIS_PRINT_MEM ) solver->rhistory[iter] = nrm2;
530 			if( output & LIS_PRINT_OUT ) lis_print_rhistory(comm,iter,nrm2);
531 		}
532 
533 		if( tol > nrm2 )
534 		{
535 			solver->retcode    = LIS_SUCCESS;
536 			solver->iter       = iter;
537 			solver->resid      = nrm2;
538 			solver->ptime      = ptime;
539 			LIS_DEBUG_FUNC_OUT;
540 			return LIS_SUCCESS;
541 		}
542 
543 		rho_old.hi[0] = rho.hi[0];
544 		rho_old.lo[0] = rho.lo[0];
545 	}
546 
547 	solver->retcode   = LIS_MAXITER;
548 	solver->iter      = iter;
549 	solver->resid     = nrm2;
550 	LIS_DEBUG_FUNC_OUT;
551 	return LIS_MAXITER;
552 }
553 
554 
555 #undef __FUNC__
556 #define __FUNC__ "lis_bicgsafe_switch"
lis_bicgsafe_switch(LIS_SOLVER solver)557 LIS_INT lis_bicgsafe_switch(LIS_SOLVER solver)
558 {
559 	LIS_Comm comm;
560 	LIS_MATRIX A;
561 	LIS_VECTOR x;
562 	LIS_VECTOR r, rtld, rhat, p, ptld, phat;
563 	LIS_VECTOR t, ttld, that, t0, t0hat;
564 	LIS_VECTOR y, w, u, z;
565 	LIS_QUAD_PTR alpha, beta, rho, rho_old;
566 	LIS_QUAD_PTR qsi, eta, one;
567 	LIS_QUAD_PTR tmp, tmpdot[5];
568 	LIS_REAL bnrm2, nrm2, tol, tol2;
569 	LIS_INT iter,maxiter,output,conv;
570 	LIS_INT iter2,maxiter2;
571 	double time,ptime;
572 
573 
574 	LIS_DEBUG_FUNC_IN;
575 
576 	comm = LIS_COMM_WORLD;
577 
578 	A       = solver->A;
579 	x       = solver->x;
580 	maxiter  = solver->options[LIS_OPTIONS_MAXITER];
581 	maxiter2 = solver->options[LIS_OPTIONS_SWITCH_MAXITER];
582 	output   = solver->options[LIS_OPTIONS_OUTPUT];
583 	conv    = solver->options[LIS_OPTIONS_CONV_COND];
584 	tol      = solver->params[LIS_PARAMS_RESID-LIS_OPTIONS_LEN];
585 	tol2     = solver->params[LIS_PARAMS_SWITCH_RESID-LIS_OPTIONS_LEN];
586 	ptime   = 0.0;
587 
588 	rtld    = solver->work[0];
589 	r       = solver->work[1];
590 	rhat    = solver->work[2];
591 	p       = solver->work[3];
592 	ptld    = solver->work[4];
593 	phat    = solver->work[5];
594 	t       = solver->work[6];
595 	ttld    = solver->work[7];
596 	that    = solver->work[8];
597 	t0      = solver->work[9];
598 	t0hat   = solver->work[10];
599 	y       = solver->work[11];
600 	w       = solver->work[12];
601 	u       = solver->work[13];
602 	z       = solver->work[14];
603 
604 	LIS_QUAD_SCALAR_MALLOC(alpha,0,1);
605 	LIS_QUAD_SCALAR_MALLOC(beta,1,1);
606 	LIS_QUAD_SCALAR_MALLOC(rho,2,1);
607 	LIS_QUAD_SCALAR_MALLOC(rho_old,3,1);
608 	LIS_QUAD_SCALAR_MALLOC(qsi,4,1);
609 	LIS_QUAD_SCALAR_MALLOC(eta,5,1);
610 	LIS_QUAD_SCALAR_MALLOC(tmp,6,1);
611 	LIS_QUAD_SCALAR_MALLOC(tmpdot[0],7,1);
612 	LIS_QUAD_SCALAR_MALLOC(tmpdot[1],8,1);
613 	LIS_QUAD_SCALAR_MALLOC(tmpdot[2],9,1);
614 	LIS_QUAD_SCALAR_MALLOC(tmpdot[3],10,1);
615 	LIS_QUAD_SCALAR_MALLOC(tmpdot[4],11,1);
616 	LIS_QUAD_SCALAR_MALLOC(one,13,1);
617 
618 	rho_old.hi[0] = 1.0;
619 	rho_old.lo[0] = 0.0;
620 	alpha.hi[0] = 1.0;
621 	alpha.lo[0] = 0.0;
622 	qsi.hi[0] = 1.0;
623 	qsi.lo[0] = 0.0;
624 	one.hi[0] = -1.0;
625 	one.lo[0] = 0.0;
626 
627 
628 	/* Initial Residual */
629 	if( lis_solver_get_initial_residual(solver,NULL,NULL,r,&bnrm2) )
630 	{
631 		LIS_DEBUG_FUNC_OUT;
632 		return LIS_SUCCESS;
633 	}
634 	tol2     = solver->tol_switch;
635 
636 	lis_solver_set_shadowresidual(solver,r,rtld);
637 
638 	lis_vector_set_allex_nm(0.0, ttld);
639 	lis_vector_set_allex_nm(0.0, ptld);
640 	lis_vector_set_allex_nm(0.0, p);
641 	lis_vector_set_allex_nm(0.0, u);
642 	lis_vector_set_allex_nm(0.0, t);
643 	lis_vector_set_allex_nm(0.0, t0);
644 
645 	for( iter=1; iter<=maxiter2; iter++ )
646 	{
647 		/* rho = <rtld,r> */
648 		lis_vector_dot(rtld,r,&rho.hi[0]);
649 
650 		/* test breakdown */
651 		if( rho.hi[0]==0.0 )
652 		{
653 			solver->retcode   = LIS_BREAKDOWN;
654 			solver->iter      = iter;
655 			solver->iter2     = iter;
656 			solver->resid     = nrm2;
657 			LIS_DEBUG_FUNC_OUT;
658 			return LIS_BREAKDOWN;
659 		}
660 
661 		/* beta = (rho / rho_old) * (alpha / qsi) */
662 		beta.hi[0] = (rho.hi[0] / rho_old.hi[0]) * (alpha.hi[0] / qsi.hi[0]);
663 
664 		/* w = ttld + beta*ptld */
665 		lis_vector_axpyz(beta.hi[0],ptld,ttld,w);
666 
667 		/* rhat = M^-1 * r */
668 		time = lis_wtime();
669 		lis_psolve(solver, r, rhat);
670 		ptime += lis_wtime()-time;
671 
672 		/* p = rhat + beta*(p - u) */
673 		lis_vector_axpy(-1,u,p);
674 		lis_vector_xpay(rhat,beta.hi[0],p);
675 
676 		/* ptld = A * p */
677 		lis_matvec(A,p,ptld);
678 
679 		/* tmpdot[0] = <rtld,ptld> */
680 		lis_vector_dot(rtld,ptld,&tmpdot[0].hi[0]);
681 		/* test breakdown */
682 		/* */
683 
684 		/* alpha = rho / tmpdot[0] */
685 		alpha.hi[0] = rho.hi[0] / tmpdot[0].hi[0];
686 
687 		/* y = t - r + alpha*(-w + ptld) */
688 		lis_vector_axpyz(-1,w,ptld,y);
689 		lis_vector_xpay(t,alpha.hi[0],y);
690 		lis_vector_axpy(-1,r,y);
691 
692 		/* t = r - alpha*ptld */
693 		lis_vector_axpyz(-alpha.hi[0],ptld,r,t);
694 
695 		/* that  = M^-1 * t */
696 		/* phat  = M^-1 * ptld */
697 		/* t0hat = M^-1 * t0 */
698 		time = lis_wtime();
699 		lis_psolve(solver, t, that);
700 		lis_psolve(solver, ptld, phat);
701 		lis_psolve(solver, t0, t0hat);
702 		ptime += lis_wtime()-time;
703 
704 		/* ttld = A * that */
705 		lis_matvec(A,that,ttld);
706 
707 		/* tmpdot[0] = <y,y>       */
708 		/* tmpdot[1] = <ttld,t>    */
709 		/* tmpdot[2] = <y,t>       */
710 		/* tmpdot[3] = <ttld,y>    */
711 		/* tmpdot[4] = <ttld,ttld> */
712 		lis_vector_dot(y,y,&tmpdot[0].hi[0]);
713 		lis_vector_dot(ttld,t,&tmpdot[1].hi[0]);
714 		lis_vector_dot(y,t,&tmpdot[2].hi[0]);
715 		lis_vector_dot(ttld,y,&tmpdot[3].hi[0]);
716 		lis_vector_dot(ttld,ttld,&tmpdot[4].hi[0]);
717 		if(iter==1)
718 		{
719 			qsi.hi[0] = tmpdot[1].hi[0] / tmpdot[4].hi[0];
720 			eta.hi[0] = 0.0;
721 		}
722 		else
723 		{
724 			tmp.hi[0] = tmpdot[4].hi[0]*tmpdot[0].hi[0]  - tmpdot[3].hi[0]*tmpdot[3].hi[0];
725 			qsi.hi[0] = (tmpdot[0].hi[0]*tmpdot[1].hi[0] - tmpdot[2].hi[0]*tmpdot[3].hi[0]) / tmp.hi[0];
726 			eta.hi[0] = (tmpdot[4].hi[0]*tmpdot[2].hi[0] - tmpdot[3].hi[0]*tmpdot[1].hi[0]) / tmp.hi[0];
727 		}
728 
729 		/* u = qsi*phat + eta*(t0hat - rhat + beta*u) */
730 		lis_vector_xpay(t0hat,beta.hi[0],u);
731 		lis_vector_axpy(-1,rhat,u);
732 		lis_vector_scale(eta.hi[0],u);
733 		lis_vector_axpy(qsi.hi[0],phat,u);
734 
735 		/* z = qsi*rhat + eta*z - alpha*u */
736 		lis_vector_scale(eta.hi[0],z);
737 		lis_vector_axpy(qsi.hi[0],rhat,z);
738 		lis_vector_axpy(-alpha.hi[0],u,z);
739 
740 		/* x = x + alpha*p + z */
741 		lis_vector_axpy(alpha.hi[0],p,x);
742 		lis_vector_axpy(1,z,x);
743 
744 		/* r = t - eta*y - qsi*ttld */
745 		lis_vector_axpyz(-eta.hi[0],y,t,r);
746 		lis_vector_axpy(-qsi.hi[0],ttld,r);
747 
748 		/* convergence check */
749 		lis_solver_get_residual[conv](r,solver,&nrm2);
750 		if( output )
751 		{
752 			if( output & LIS_PRINT_MEM ) solver->rhistory[iter] = nrm2;
753 			if( output & LIS_PRINT_OUT ) lis_print_rhistory(comm,iter,nrm2);
754 		}
755 
756 		if( tol2 >= nrm2 )
757 		{
758 			solver->iter       = iter;
759 			solver->iter2      = iter;
760 			solver->ptime      = ptime;
761 			break;
762 		}
763 
764 		lis_vector_copy(t,t0);
765 		rho_old.hi[0] = rho.hi[0];
766 	}
767 
768 	r->precision = LIS_PRECISION_QUAD;
769 	p->precision = LIS_PRECISION_QUAD;
770 	t->precision = LIS_PRECISION_QUAD;
771 	t0->precision = LIS_PRECISION_QUAD;
772 	ptld->precision = LIS_PRECISION_QUAD;
773 	that->precision = LIS_PRECISION_QUAD;
774 
775 	solver->options[LIS_OPTIONS_INITGUESS_ZEROS] = LIS_FALSE;
776 	lis_vector_copyex_mn(x,solver->xx);
777 
778 	rho_old.hi[0] = 1.0;
779 	alpha.hi[0] = 1.0;
780 	qsi.hi[0] = 1.0;
781 	one.hi[0] = -1.0;
782 
783 	/* Initial Residual */
784 	lis_solver_get_initial_residual(solver,NULL,NULL,r,&bnrm2);
785 	tol     = solver->tol;
786 
787 	lis_solver_set_shadowresidual(solver,r,rtld);
788 
789 	lis_vector_set_allex_nm(0.0, ttld);
790 	lis_vector_set_allex_nm(0.0, ptld);
791 	lis_vector_set_allex_nm(0.0, p);
792 	lis_vector_set_allex_nm(0.0, u);
793 	lis_vector_set_allex_nm(0.0, t);
794 	lis_vector_set_allex_nm(0.0, t0);
795 
796 	for( iter2=iter+1; iter2<=maxiter; iter2++ )
797 	{
798 		/* rho = <rtld,r> */
799 		lis_vector_dotex_mmm(rtld,r,&rho);
800 
801 		/* test breakdown */
802 		if( rho.hi[0]==0.0 && rho.lo[0]==0.0 )
803 		{
804 			solver->retcode   = LIS_BREAKDOWN;
805 			solver->iter      = iter2;
806 			solver->iter2     = iter;
807 			solver->resid     = nrm2;
808 			LIS_DEBUG_FUNC_OUT;
809 			return LIS_BREAKDOWN;
810 		}
811 
812 		/* beta = (rho / rho_old) * (alpha / qsi) */
813 		lis_quad_div((LIS_QUAD *)beta.hi,(LIS_QUAD *)rho.hi,(LIS_QUAD *)rho_old.hi);
814 		lis_quad_div((LIS_QUAD *)tmp.hi,(LIS_QUAD *)alpha.hi,(LIS_QUAD *)qsi.hi);
815 		lis_quad_mul((LIS_QUAD *)beta.hi,(LIS_QUAD *)beta.hi,(LIS_QUAD *)tmp.hi);
816 
817 		/* w = ttld + beta*ptld */
818 		lis_vector_axpyzex_mmmm(beta,ptld,ttld,w);
819 
820 		/* rhat = M^-1 * r */
821 		time = lis_wtime();
822 		lis_psolve(solver, r, rhat);
823 		ptime += lis_wtime()-time;
824 
825 		/* p = rhat + beta*(p - u) */
826 		lis_vector_axpyex_mmm(one,u,p);
827 		lis_vector_xpayex_mmm(rhat,beta,p);
828 
829 		/* ptld = A * p */
830 		lis_matvec(A,p,ptld);
831 
832 		/* tmpdot[0] = <rtld,ptld> */
833 		lis_vector_dotex_mmm(rtld,ptld,&tmpdot[0]);
834 		/* test breakdown */
835 		/* */
836 
837 		/* alpha = rho / tmpdot[0] */
838 		lis_quad_div((LIS_QUAD *)alpha.hi,(LIS_QUAD *)rho.hi,(LIS_QUAD *)tmpdot[0].hi);
839 
840 		/* y = t - r + alpha*(-w + ptld) */
841 		lis_vector_axpyzex_mmmm(one,w,ptld,y);
842 		lis_vector_xpayex_mmm(t,alpha,y);
843 		lis_vector_axpyex_mmm(one,r,y);
844 
845 		/* t = r - alpha*ptld */
846 		lis_quad_minus((LIS_QUAD *)alpha.hi);
847 		lis_vector_axpyzex_mmmm(alpha,ptld,r,t);
848 
849 		/* that  = M^-1 * t */
850 		/* phat  = M^-1 * ptld */
851 		/* t0hat = M^-1 * t0 */
852 		time = lis_wtime();
853 		lis_psolve(solver, t, that);
854 		lis_psolve(solver, ptld, phat);
855 		lis_psolve(solver, t0, t0hat);
856 		ptime += lis_wtime()-time;
857 
858 		/* ttld = A * that */
859 		lis_matvec(A,that,ttld);
860 
861 		/* tmpdot[0] = <y,y>       */
862 		/* tmpdot[1] = <ttld,t>    */
863 		/* tmpdot[2] = <y,t>       */
864 		/* tmpdot[3] = <ttld,y>    */
865 		/* tmpdot[4] = <ttld,ttld> */
866 		lis_vector_dotex_mmm(y,y,&tmpdot[0]);
867 		lis_vector_dotex_mmm(ttld,t,&tmpdot[1]);
868 		lis_vector_dotex_mmm(y,t,&tmpdot[2]);
869 		lis_vector_dotex_mmm(ttld,y,&tmpdot[3]);
870 		lis_vector_dotex_mmm(ttld,ttld,&tmpdot[4]);
871 		if(iter==1)
872 		{
873 			lis_quad_div((LIS_QUAD *)qsi.hi,(LIS_QUAD *)tmpdot[1].hi,(LIS_QUAD *)tmpdot[4].hi);
874 			eta.hi[0] = 0.0;
875 			eta.lo[0] = 0.0;
876 		}
877 		else
878 		{
879 			lis_quad_mul((LIS_QUAD *)tmp.hi,(LIS_QUAD *)tmpdot[4].hi,(LIS_QUAD *)tmpdot[0].hi);
880 			lis_quad_sqr((LIS_QUAD *)qsi.hi,(LIS_QUAD *)tmpdot[3].hi);
881 			lis_quad_sub((LIS_QUAD *)tmp.hi,(LIS_QUAD *)tmp.hi,(LIS_QUAD *)qsi.hi);
882 
883 			lis_quad_mul((LIS_QUAD *)qsi.hi,(LIS_QUAD *)tmpdot[0].hi,(LIS_QUAD *)tmpdot[1].hi);
884 			lis_quad_mul((LIS_QUAD *)eta.hi,(LIS_QUAD *)tmpdot[2].hi,(LIS_QUAD *)tmpdot[3].hi);
885 			lis_quad_sub((LIS_QUAD *)qsi.hi,(LIS_QUAD *)qsi.hi,(LIS_QUAD *)eta.hi);
886 			lis_quad_div((LIS_QUAD *)qsi.hi,(LIS_QUAD *)qsi.hi,(LIS_QUAD *)tmp.hi);
887 
888 			lis_quad_mul((LIS_QUAD *)eta.hi,(LIS_QUAD *)tmpdot[4].hi,(LIS_QUAD *)tmpdot[2].hi);
889 			lis_quad_mul((LIS_QUAD *)tmpdot[0].hi,(LIS_QUAD *)tmpdot[3].hi,(LIS_QUAD *)tmpdot[1].hi);
890 			lis_quad_sub((LIS_QUAD *)eta.hi,(LIS_QUAD *)eta.hi,(LIS_QUAD *)tmpdot[0].hi);
891 			lis_quad_div((LIS_QUAD *)eta.hi,(LIS_QUAD *)eta.hi,(LIS_QUAD *)tmp.hi);
892 		}
893 
894 		/* u = qsi*phat + eta*(t0hat - rhat + beta*u) */
895 		lis_vector_xpayex_mmm(t0hat,beta,u);
896 		lis_vector_axpyex_mmm(one,rhat,u);
897 		lis_vector_scaleex_mm(eta,u);
898 		lis_vector_axpyex_mmm(qsi,phat,u);
899 
900 		/* z = qsi*rhat + eta*z - alpha*u */
901 		lis_vector_scaleex_mm(eta,z);
902 		lis_vector_axpyex_mmm(qsi,rhat,z);
903 		lis_vector_axpyex_mmm(alpha,u,z);
904 
905 		/* x = x + alpha*p + z */
906 		lis_quad_minus((LIS_QUAD *)alpha.hi);
907 		lis_quad_minus((LIS_QUAD *)one.hi);
908 		lis_vector_axpyex_mmm(alpha,p,x);
909 		lis_vector_axpyex_mmm(one,z,x);
910 		lis_quad_minus((LIS_QUAD *)one.hi);
911 
912 		/* r = t - eta*y - qsi*ttld */
913 		lis_quad_minus((LIS_QUAD *)eta.hi);
914 		lis_quad_minus((LIS_QUAD *)qsi.hi);
915 		lis_vector_axpyzex_mmmm(eta,y,t,r);
916 		lis_vector_axpyex_mmm(qsi,ttld,r);
917 		lis_quad_minus((LIS_QUAD *)eta.hi);
918 		lis_quad_minus((LIS_QUAD *)qsi.hi);
919 
920 		/* convergence check */
921 		lis_solver_get_residual[conv](r,solver,&nrm2);
922 		if( output )
923 		{
924 			if( output & LIS_PRINT_MEM ) solver->rhistory[iter2] = nrm2;
925 			if( output & LIS_PRINT_OUT ) lis_print_rhistory(comm,iter,nrm2);
926 		}
927 
928 		if( tol > nrm2 )
929 		{
930 			solver->retcode    = LIS_SUCCESS;
931 			solver->iter       = iter2;
932 			solver->iter2      = iter;
933 			solver->resid      = nrm2;
934 			solver->ptime      = ptime;
935 			LIS_DEBUG_FUNC_OUT;
936 			return LIS_SUCCESS;
937 		}
938 
939 		lis_vector_copyex_mm(t,t0);
940 		rho_old.hi[0] = rho.hi[0];
941 		rho_old.lo[0] = rho.lo[0];
942 	}
943 	solver->retcode   = LIS_MAXITER;
944 	solver->iter      = iter;
945 	solver->iter2     = iter2;
946 	solver->resid     = nrm2;
947 	LIS_DEBUG_FUNC_OUT;
948 	return LIS_MAXITER;
949 }
950 #endif
951 
952 /********************************************
953  * Preconditioned BiConjugate Residual Safe *
954  ********************************************
955  r(0)    = b - Ax(0)
956  rtld(0) = conj(r(0)) or random
957  artld(0)= A^H * rtld(0)
958  mr(0)   = M^-1 * r(0)
959  amr(0)  = A * mr(0)
960  p(0)    = mr(0)
961  ap(0)   = amr(0)
962  rho(0)  = <rtld,amr(0)>
963  ********************************************
964  for k=1,2,...
965    map(k-1)= M^-1 * ap(k-1)
966    tmpdot0 = <map(k-1),artld(0)>
967    alpha   = rho(k-1) / tmpdot0
968    tmpdot0 = <y(k-1),y(k-1)>
969    tmpdot1 = <amr(k-1),r(k-1)>
970    tmpdot2 = <y(k-1),r(k-1)>
971    tmpdot3 = <amr(k-1),y(k-1)>
972    tmpdot4 = <amr(k-1),amr(k-1)>
973    tmp     = tmpdot4*tmpdot0-tmpdot3*tmpdot3
974    qsi     = (tmpdot0*tmpdot1-tmpdot2*tmpdot3) / tmp
975    eta     = (tmpdot4*tmpdot2-tmpdot3*tmpdot1) / tmp
976    u(k-1)  = qsi*map(k-1) + eta*my(k) + eta*beta*u(k-2)
977    au(k-1) = A * u(k-1)
978    z(k-1)  = qsi*mr(k-1) + eta*z(k-2) - alpha*u(k-1)
979    y(k)    = qsi*amr(k-1) + eta*y(k-1) - alpha*au(k-1)
980    my(k)   = M^-1 * y(k)
981    x(k)    = x(k-1) + alpha*p(k-1) + z(k-1)
982    r(k)    = r(k-1) - alpha*ap(k-1) - y(k)
983    mr(k)   = mr(k-1) - alpha*map(k-1) - my(k)
984    amr(k)  = A * mr(k)
985    rho(k)  = <rtld,amr(k)>
986    beta    = (rho(k) / rho(k-1)) * (alpha / qsi)
987    p(k)    = mr(k) + beta*(p(k-1) - u(k-1))
988    ap(k-1) = amr(k) + beta*(ap(k-1) - au(k-1))
989  ********************************************/
990 #undef NWORK
991 #define NWORK 13
992 #undef __FUNC__
993 #define __FUNC__ "lis_bicrsafe_check_params"
lis_bicrsafe_check_params(LIS_SOLVER solver)994 LIS_INT lis_bicrsafe_check_params(LIS_SOLVER solver)
995 {
996 	LIS_DEBUG_FUNC_IN;
997 	LIS_DEBUG_FUNC_OUT;
998 	return LIS_SUCCESS;
999 }
1000 
1001 #undef __FUNC__
1002 #define __FUNC__ "lis_bicrsafe_malloc_work"
lis_bicrsafe_malloc_work(LIS_SOLVER solver)1003 LIS_INT lis_bicrsafe_malloc_work(LIS_SOLVER solver)
1004 {
1005 	LIS_VECTOR *work;
1006 	LIS_INT	i,j,worklen,err;
1007 
1008 	LIS_DEBUG_FUNC_IN;
1009 
1010 	worklen = NWORK;
1011 	work    = (LIS_VECTOR *)lis_malloc( worklen*sizeof(LIS_VECTOR),"lis_bicgsafe_malloc_work::work" );
1012 	if( work==NULL )
1013 	{
1014 		LIS_SETERR_MEM(worklen*sizeof(LIS_VECTOR));
1015 		return LIS_ERR_OUT_OF_MEMORY;
1016 	}
1017 	if( solver->precision==LIS_PRECISION_DEFAULT )
1018 	{
1019 		for(i=0;i<worklen;i++)
1020 		{
1021 			err = lis_vector_duplicate(solver->A,&work[i]);
1022 			if( err ) break;
1023 		}
1024 	}
1025 	else
1026 	{
1027 		for(i=0;i<worklen;i++)
1028 		{
1029 			err = lis_vector_duplicateex(LIS_PRECISION_QUAD,solver->A,&work[i]);
1030 			if( err ) break;
1031 		}
1032 	}
1033 	if( i<worklen )
1034 	{
1035 		for(j=0;j<i;j++) lis_vector_destroy(work[j]);
1036 		lis_free(work);
1037 		return err;
1038 	}
1039 	solver->worklen = worklen;
1040 	solver->work    = work;
1041 
1042 	LIS_DEBUG_FUNC_OUT;
1043 	return LIS_SUCCESS;
1044 }
1045 
1046 #undef __FUNC__
1047 #define __FUNC__ "lis_bicrsafe"
lis_bicrsafe(LIS_SOLVER solver)1048 LIS_INT lis_bicrsafe(LIS_SOLVER solver)
1049 {
1050 	LIS_Comm comm;
1051 	LIS_MATRIX A;
1052 	LIS_VECTOR x;
1053 	LIS_VECTOR r, rtld, artld, mr, amr, p, ap, map;
1054 	LIS_VECTOR y, my, u, au, z;
1055 	LIS_SCALAR alpha, beta;
1056 	LIS_SCALAR rho, rho_old;
1057 	LIS_SCALAR qsi, eta;
1058 	LIS_SCALAR tmp, tmpdot[5];
1059 	LIS_REAL bnrm2, nrm2, tol;
1060 	LIS_INT iter,maxiter,output,conv;
1061 	double time,ptime;
1062 
1063 	LIS_DEBUG_FUNC_IN;
1064 
1065 	comm = LIS_COMM_WORLD;
1066 
1067 	A       = solver->A;
1068 	x       = solver->x;
1069 	maxiter = solver->options[LIS_OPTIONS_MAXITER];
1070 	output  = solver->options[LIS_OPTIONS_OUTPUT];
1071 	conv    = solver->options[LIS_OPTIONS_CONV_COND];
1072 	ptime   = 0.0;
1073 
1074 	rtld    = solver->work[0];
1075 	r       = solver->work[1];
1076 	mr      = solver->work[2];
1077 	amr     = solver->work[3];
1078 	p       = solver->work[4];
1079 	ap      = solver->work[5];
1080 	map     = solver->work[6];
1081 	my      = solver->work[7];
1082 	y       = solver->work[8];
1083 	u       = solver->work[9];
1084 	z       = solver->work[10];
1085 	au      = solver->work[11];
1086 	artld   = solver->work[12];
1087 
1088 
1089 	/* Initial Residual */
1090 	if( lis_solver_get_initial_residual(solver,NULL,NULL,r,&bnrm2) )
1091 	{
1092 		LIS_DEBUG_FUNC_OUT;
1093 		return LIS_SUCCESS;
1094 	}
1095 	tol     = solver->tol;
1096 
1097 	lis_solver_set_shadowresidual(solver,r,rtld);
1098 
1099 	lis_matvech(A,rtld,artld);
1100 	time = lis_wtime();
1101 	lis_psolve(solver, r, mr);
1102 	ptime += lis_wtime()-time;
1103 	lis_matvec(A,mr,amr);
1104 	lis_vector_dot(rtld,amr,&rho_old);
1105 	lis_vector_copy(amr,ap);
1106 	lis_vector_copy(mr,p);
1107 	beta = 0.0;
1108 
1109 
1110 	for( iter=1; iter<=maxiter; iter++ )
1111 	{
1112 		/* map  = M^-1 * ap */
1113 		time = lis_wtime();
1114 		lis_psolve(solver, ap, map);
1115 		ptime += lis_wtime()-time;
1116 
1117 		/* tmpdot[0] = <artld,map> */
1118 		/* alpha = rho_old / tmpdot[0] */
1119 		lis_vector_dot(artld,map,&tmpdot[0]);
1120 		alpha = rho_old / tmpdot[0];
1121 
1122 
1123 		/* tmpdot[0] = <y,y>           */
1124 		/* tmpdot[1] = <amr,r>         */
1125 		/* tmpdot[2] = <y,r>           */
1126 		/* tmpdot[3] = <amr,y>         */
1127 		/* tmpdot[4] = <amr,amr>       */
1128 		lis_vector_dot(y,y,&tmpdot[0]);
1129 		lis_vector_dot(amr,r,&tmpdot[1]);
1130 		lis_vector_dot(y,r,&tmpdot[2]);
1131 		lis_vector_dot(amr,y,&tmpdot[3]);
1132 		lis_vector_dot(amr,amr,&tmpdot[4]);
1133 		if(iter==1)
1134 		{
1135 			qsi = tmpdot[1] / tmpdot[4];
1136 			eta = 0.0;
1137 		}
1138 		else
1139 		{
1140 			tmp = tmpdot[4]*tmpdot[0] - tmpdot[3]*tmpdot[3];
1141 			qsi = (tmpdot[0]*tmpdot[1] - tmpdot[2]*tmpdot[3]) / tmp;
1142 			eta = (tmpdot[4]*tmpdot[2] - tmpdot[3]*tmpdot[1]) / tmp;
1143 		}
1144 
1145 		/* u    = qsi*map + eta*my + eta*beta*u */
1146 		/* au   = A * u                         */
1147 		lis_vector_scale(eta*beta,u);
1148 		lis_vector_axpy(qsi,map,u);
1149 		lis_vector_axpy(eta,my,u);
1150 		lis_matvec(A,u,au);
1151 
1152 		/* z = qsi*mr + eta*z - alpha*u */
1153 		lis_vector_scale(eta,z);
1154 		lis_vector_axpy(qsi,mr,z);
1155 		lis_vector_axpy(-alpha,u,z);
1156 
1157 		/* y  = qsi*amr + eta*y - alpha*au */
1158 		/* my = M^-1 * y */
1159 		lis_vector_scale(eta,y);
1160 		lis_vector_axpy(qsi,amr,y);
1161 		lis_vector_axpy(-alpha,au,y);
1162 		time = lis_wtime();
1163 		lis_psolve(solver, y, my);
1164 		ptime += lis_wtime()-time;
1165 
1166 		/* x = x + alpha*p + z */
1167 		lis_vector_axpy(alpha,p,x);
1168 		lis_vector_axpy(1.0,z,x);
1169 
1170 		/* r = r - alpha*ap - y */
1171 		lis_vector_axpy(-alpha,ap,r);
1172 		lis_vector_axpy(-1.0,y,r);
1173 
1174 		/* convergence check */
1175 		lis_solver_get_residual[conv](r,solver,&nrm2);
1176 		if( output )
1177 		{
1178 			if( output & LIS_PRINT_MEM ) solver->rhistory[iter] = nrm2;
1179 			if( output & LIS_PRINT_OUT ) lis_print_rhistory(comm,iter,nrm2);
1180 		}
1181 
1182 		if( tol >= nrm2 )
1183 		{
1184 			solver->retcode    = LIS_SUCCESS;
1185 			solver->iter       = iter;
1186 			solver->resid      = nrm2;
1187 			solver->ptime      = ptime;
1188 			LIS_DEBUG_FUNC_OUT;
1189 			return LIS_SUCCESS;
1190 		}
1191 
1192 		/* mr  = mr - alpha*map - my */
1193 		/* amr = A * mr              */
1194 		/* rho = <rtld,amr> */
1195 		lis_vector_axpy(-alpha,map,mr);
1196 		lis_vector_axpy(-1.0,my,mr);
1197 		lis_matvec(A,mr,amr);
1198 		lis_vector_dot(rtld,amr,&rho);
1199 		if( rho==0.0 )
1200 		{
1201 			solver->retcode   = LIS_BREAKDOWN;
1202 			solver->iter      = iter;
1203 			solver->resid     = nrm2;
1204 			LIS_DEBUG_FUNC_OUT;
1205 			return LIS_BREAKDOWN;
1206 		}
1207 
1208 		/* beta = (rho / rho_old) * (alpha / qsi) */
1209 		beta = (rho / rho_old) * (alpha / qsi);
1210 
1211 
1212 		/* p  = mr + beta*(p - u)    */
1213 		/* ap = amr + beta*(ap - au) */
1214 		lis_vector_axpy(-1.0,u,p);
1215 		lis_vector_xpay(mr,beta,p);
1216 		lis_vector_axpy(-1.0,au,ap);
1217 		lis_vector_xpay(amr,beta,ap);
1218 
1219 		rho_old = rho;
1220 	}
1221 
1222 	solver->retcode   = LIS_MAXITER;
1223 	solver->iter      = iter;
1224 	solver->resid     = nrm2;
1225 	LIS_DEBUG_FUNC_OUT;
1226 	return LIS_MAXITER;
1227 }
1228 
1229 #ifdef USE_QUAD_PRECISION
1230 #undef __FUNC__
1231 #define __FUNC__ "lis_bicrsafe_quad"
lis_bicrsafe_quad(LIS_SOLVER solver)1232 LIS_INT lis_bicrsafe_quad(LIS_SOLVER solver)
1233 {
1234 	LIS_Comm comm;
1235 	LIS_MATRIX A;
1236 	LIS_VECTOR x;
1237 	LIS_VECTOR r, rtld, artld, mr, amr, p, ap, map;
1238 	LIS_VECTOR y, my, u, au, z;
1239 	LIS_QUAD_PTR alpha, beta, rho, rho_old;
1240 	LIS_QUAD_PTR qsi, eta, one;
1241 	LIS_QUAD_PTR tmp, tmpdot[5];
1242 	LIS_REAL bnrm2, nrm2, tol;
1243 	LIS_INT iter,maxiter,output,conv;
1244 	double time,ptime;
1245 
1246 	LIS_DEBUG_FUNC_IN;
1247 
1248 	comm = LIS_COMM_WORLD;
1249 
1250 	A       = solver->A;
1251 	x       = solver->x;
1252 	maxiter = solver->options[LIS_OPTIONS_MAXITER];
1253 	output  = solver->options[LIS_OPTIONS_OUTPUT];
1254 	conv    = solver->options[LIS_OPTIONS_CONV_COND];
1255 	ptime   = 0.0;
1256 
1257 	rtld    = solver->work[0];
1258 	r       = solver->work[1];
1259 	mr      = solver->work[2];
1260 	amr     = solver->work[3];
1261 	p       = solver->work[4];
1262 	ap      = solver->work[5];
1263 	map     = solver->work[6];
1264 	my      = solver->work[7];
1265 	y       = solver->work[8];
1266 	u       = solver->work[9];
1267 	z       = solver->work[10];
1268 	au      = solver->work[11];
1269 	artld   = solver->work[12];
1270 
1271 	LIS_QUAD_SCALAR_MALLOC(alpha,0,1);
1272 	LIS_QUAD_SCALAR_MALLOC(beta,1,1);
1273 	LIS_QUAD_SCALAR_MALLOC(rho,2,1);
1274 	LIS_QUAD_SCALAR_MALLOC(rho_old,3,1);
1275 	LIS_QUAD_SCALAR_MALLOC(qsi,4,1);
1276 	LIS_QUAD_SCALAR_MALLOC(eta,5,1);
1277 	LIS_QUAD_SCALAR_MALLOC(tmp,6,1);
1278 	LIS_QUAD_SCALAR_MALLOC(tmpdot[0],7,1);
1279 	LIS_QUAD_SCALAR_MALLOC(tmpdot[1],8,1);
1280 	LIS_QUAD_SCALAR_MALLOC(tmpdot[2],9,1);
1281 	LIS_QUAD_SCALAR_MALLOC(tmpdot[3],10,1);
1282 	LIS_QUAD_SCALAR_MALLOC(tmpdot[4],11,1);
1283 	LIS_QUAD_SCALAR_MALLOC(one,13,1);
1284 
1285 
1286 	/* Initial Residual */
1287 	if( lis_solver_get_initial_residual(solver,NULL,NULL,r,&bnrm2) )
1288 	{
1289 		LIS_DEBUG_FUNC_OUT;
1290 		return LIS_SUCCESS;
1291 	}
1292 	tol     = solver->tol;
1293 
1294 	lis_solver_set_shadowresidual(solver,r,rtld);
1295 
1296 	lis_matvech(A,rtld,artld);
1297 	time = lis_wtime();
1298 	lis_psolve(solver, r, mr);
1299 	ptime += lis_wtime()-time;
1300 	lis_matvec(A,mr,amr);
1301 	lis_vector_dotex_mmm(rtld,amr,&rho_old);
1302 	lis_vector_copyex_mm(amr,ap);
1303 	lis_vector_copyex_mm(mr,p);
1304 	one.hi[0] = -1.0;
1305 	one.lo[0] = 0.0;
1306 
1307 
1308 	for( iter=1; iter<=maxiter; iter++ )
1309 	{
1310 		/* map  = M^-1 * ap */
1311 		time = lis_wtime();
1312 		lis_psolve(solver, ap, map);
1313 		ptime += lis_wtime()-time;
1314 
1315 		/* tmpdot[0] = <artld,map> */
1316 		/* alpha = rho_old / tmpdot[0] */
1317 		lis_vector_dotex_mmm(artld,map,&tmpdot[0]);
1318 		lis_quad_div((LIS_QUAD *)alpha.hi,(LIS_QUAD *)rho_old.hi,(LIS_QUAD *)tmpdot[0].hi);
1319 
1320 
1321 		/* tmpdot[0] = <y,y>           */
1322 		/* tmpdot[1] = <amr,r>         */
1323 		/* tmpdot[2] = <y,r>           */
1324 		/* tmpdot[3] = <amr,y>         */
1325 		/* tmpdot[4] = <amr,amr>       */
1326 		lis_vector_dotex_mmm(y,y,&tmpdot[0]);
1327 		lis_vector_dotex_mmm(amr,r,&tmpdot[1]);
1328 		lis_vector_dotex_mmm(y,r,&tmpdot[2]);
1329 		lis_vector_dotex_mmm(amr,y,&tmpdot[3]);
1330 		lis_vector_dotex_mmm(amr,amr,&tmpdot[4]);
1331 		if(iter==1)
1332 		{
1333 			lis_quad_div((LIS_QUAD *)qsi.hi,(LIS_QUAD *)tmpdot[1].hi,(LIS_QUAD *)tmpdot[4].hi);
1334 			eta.hi[0] = 0.0;
1335 			eta.lo[0] = 0.0;
1336 		}
1337 		else
1338 		{
1339 			lis_quad_mul((LIS_QUAD *)tmp.hi,(LIS_QUAD *)tmpdot[4].hi,(LIS_QUAD *)tmpdot[0].hi);
1340 			lis_quad_sqr((LIS_QUAD *)qsi.hi,(LIS_QUAD *)tmpdot[3].hi);
1341 			lis_quad_sub((LIS_QUAD *)tmp.hi,(LIS_QUAD *)tmp.hi,(LIS_QUAD *)qsi.hi);
1342 
1343 			lis_quad_mul((LIS_QUAD *)qsi.hi,(LIS_QUAD *)tmpdot[0].hi,(LIS_QUAD *)tmpdot[1].hi);
1344 			lis_quad_mul((LIS_QUAD *)eta.hi,(LIS_QUAD *)tmpdot[2].hi,(LIS_QUAD *)tmpdot[3].hi);
1345 			lis_quad_sub((LIS_QUAD *)qsi.hi,(LIS_QUAD *)qsi.hi,(LIS_QUAD *)eta.hi);
1346 			lis_quad_div((LIS_QUAD *)qsi.hi,(LIS_QUAD *)qsi.hi,(LIS_QUAD *)tmp.hi);
1347 
1348 			lis_quad_mul((LIS_QUAD *)eta.hi,(LIS_QUAD *)tmpdot[4].hi,(LIS_QUAD *)tmpdot[2].hi);
1349 			lis_quad_mul((LIS_QUAD *)tmpdot[0].hi,(LIS_QUAD *)tmpdot[3].hi,(LIS_QUAD *)tmpdot[1].hi);
1350 			lis_quad_sub((LIS_QUAD *)eta.hi,(LIS_QUAD *)eta.hi,(LIS_QUAD *)tmpdot[0].hi);
1351 			lis_quad_div((LIS_QUAD *)eta.hi,(LIS_QUAD *)eta.hi,(LIS_QUAD *)tmp.hi);
1352 		}
1353 
1354 		/* u    = qsi*map + eta*my + eta*beta*u */
1355 		/* au   = A * u                         */
1356 		lis_quad_mul((LIS_QUAD *)tmp.hi,(LIS_QUAD *)eta.hi,(LIS_QUAD *)beta.hi);
1357 		lis_vector_scaleex_mm(tmp,u);
1358 		lis_vector_axpyex_mmm(qsi,map,u);
1359 		lis_vector_axpyex_mmm(eta,my,u);
1360 		lis_matvec(A,u,au);
1361 
1362 		/* z = qsi*mr + eta*z - alpha*u */
1363 		lis_vector_scaleex_mm(eta,z);
1364 		lis_vector_axpyex_mmm(qsi,mr,z);
1365 		lis_quad_minus((LIS_QUAD *)alpha.hi);
1366 		lis_vector_axpyex_mmm(alpha,u,z);
1367 
1368 		/* y  = qsi*amr + eta*y - alpha*au */
1369 		/* my = M^-1 * y */
1370 		lis_vector_scaleex_mm(eta,y);
1371 		lis_vector_axpyex_mmm(qsi,amr,y);
1372 		lis_vector_axpyex_mmm(alpha,au,y);
1373 		time = lis_wtime();
1374 		lis_psolve(solver, y, my);
1375 		ptime += lis_wtime()-time;
1376 
1377 		/* x = x + alpha*p + z */
1378 		lis_quad_minus((LIS_QUAD *)alpha.hi);
1379 		lis_vector_axpyex_mmm(alpha,p,x);
1380 		lis_quad_minus((LIS_QUAD *)one.hi);
1381 		lis_vector_axpyex_mmm(one,z,x);
1382 
1383 		/* r = r - alpha*ap - y */
1384 		lis_quad_minus((LIS_QUAD *)alpha.hi);
1385 		lis_quad_minus((LIS_QUAD *)one.hi);
1386 		lis_vector_axpyex_mmm(alpha,ap,r);
1387 		lis_vector_axpyex_mmm(one,y,r);
1388 
1389 		/* convergence check */
1390 		lis_solver_get_residual[conv](r,solver,&nrm2);
1391 		if( output )
1392 		{
1393 			if( output & LIS_PRINT_MEM ) solver->rhistory[iter] = nrm2;
1394 			if( output & LIS_PRINT_OUT ) lis_print_rhistory(comm,iter,nrm2);
1395 		}
1396 
1397 		if( tol >= nrm2 )
1398 		{
1399 			solver->retcode    = LIS_SUCCESS;
1400 			solver->iter       = iter;
1401 			solver->resid      = nrm2;
1402 			solver->ptime      = ptime;
1403 			LIS_DEBUG_FUNC_OUT;
1404 			return LIS_SUCCESS;
1405 		}
1406 
1407 		/* mr  = mr - alpha*map - my */
1408 		/* amr = A * mr              */
1409 		/* rho = <rtld,amr> */
1410 		lis_vector_axpyex_mmm(alpha,map,mr);
1411 		lis_vector_axpyex_mmm(one,my,mr);
1412 		lis_matvec(A,mr,amr);
1413 		lis_vector_dotex_mmm(rtld,amr,&rho);
1414 		if( rho.hi[0]==0.0 && rho.lo[0]==0.0 )
1415 		{
1416 			solver->retcode   = LIS_BREAKDOWN;
1417 			solver->iter      = iter;
1418 			solver->resid     = nrm2;
1419 			LIS_DEBUG_FUNC_OUT;
1420 			return LIS_BREAKDOWN;
1421 		}
1422 
1423 		/* beta = (rho / rho_old) * (alpha / qsi) */
1424 		lis_quad_minus((LIS_QUAD *)alpha.hi);
1425 		lis_quad_div((LIS_QUAD *)beta.hi,(LIS_QUAD *)rho.hi,(LIS_QUAD *)rho_old.hi);
1426 		lis_quad_div((LIS_QUAD *)tmp.hi,(LIS_QUAD *)alpha.hi,(LIS_QUAD *)qsi.hi);
1427 		lis_quad_mul((LIS_QUAD *)beta.hi,(LIS_QUAD *)beta.hi,(LIS_QUAD *)tmp.hi);
1428 
1429 
1430 		/* p  = mr + beta*(p - u)    */
1431 		/* ap = amr + beta*(ap - au) */
1432 		lis_vector_axpyex_mmm(one,u,p);
1433 		lis_vector_xpayex_mmm(mr,beta,p);
1434 		lis_vector_axpyex_mmm(one,au,ap);
1435 		lis_vector_xpayex_mmm(amr,beta,ap);
1436 
1437 		rho_old.hi[0] = rho.hi[0];
1438 		rho_old.lo[0] = rho.lo[0];
1439 	}
1440 
1441 	solver->retcode   = LIS_MAXITER;
1442 	solver->iter      = iter;
1443 	solver->resid     = nrm2;
1444 	LIS_DEBUG_FUNC_OUT;
1445 	return LIS_MAXITER;
1446 }
1447 #endif
1448