1c-----------------------------------------------------------------------
2c\BeginDoc
3c
4c\Name: pdnaitr
5c
6c Message Passing Layer: MPI
7c
8c\Description:
9c  Reverse communication interface for applying NP additional steps to
10c  a K step nonsymmetric Arnoldi factorization.
11c
12c  Input:  OP*V_{k}  -  V_{k}*H = r_{k}*e_{k}^T
13c
14c          with (V_{k}^T)*B*V_{k} = I, (V_{k}^T)*B*r_{k} = 0.
15c
16c  Output: OP*V_{k+p}  -  V_{k+p}*H = r_{k+p}*e_{k+p}^T
17c
18c          with (V_{k+p}^T)*B*V_{k+p} = I, (V_{k+p}^T)*B*r_{k+p} = 0.
19c
20c  where OP and B are as in pdnaupd.  The B-norm of r_{k+p} is also
21c  computed and returned.
22c
23c\Usage:
24c  call pdnaitr
25c     ( COMM, IDO, BMAT, N, K, NP, NB, RESID, RNORM, V, LDV, H, LDH,
26c       IPNTR, WORKD, WORKL, INFO )
27c
28c\Arguments
29c  COMM    MPI Communicator for the processor grid.  (INPUT)
30c
31c  IDO     Integer.  (INPUT/OUTPUT)
32c          Reverse communication flag.
33c          -------------------------------------------------------------
34c          IDO =  0: first call to the reverse communication interface
35c          IDO = -1: compute  Y = OP * X  where
36c                    IPNTR(1) is the pointer into WORK for X,
37c                    IPNTR(2) is the pointer into WORK for Y.
38c                    This is for the restart phase to force the new
39c                    starting vector into the range of OP.
40c          IDO =  1: compute  Y = OP * X  where
41c                    IPNTR(1) is the pointer into WORK for X,
42c                    IPNTR(2) is the pointer into WORK for Y,
43c                    IPNTR(3) is the pointer into WORK for B * X.
44c          IDO =  2: compute  Y = B * X  where
45c                    IPNTR(1) is the pointer into WORK for X,
46c                    IPNTR(2) is the pointer into WORK for Y.
47c          IDO = 99: done
48c          -------------------------------------------------------------
49c          When the routine is used in the "shift-and-invert" mode, the
50c          vector B * Q is already available and do not need to be
51c          recompute in forming OP * Q.
52c
53c  BMAT    Character*1.  (INPUT)
54c          BMAT specifies the type of the matrix B that defines the
55c          semi-inner product for the operator OP.  See pdnaupd.
56c          B = 'I' -> standard eigenvalue problem A*x = lambda*x
57c          B = 'G' -> generalized eigenvalue problem A*x = lambda*M**x
58c
59c  N       Integer.  (INPUT)
60c          Dimension of the eigenproblem.
61c
62c  K       Integer.  (INPUT)
63c          Current size of V and H.
64c
65c  NP      Integer.  (INPUT)
66c          Number of additional Arnoldi steps to take.
67c
68c  NB      Integer.  (INPUT)
69c          Blocksize to be used in the recurrence.
70c          Only work for NB = 1 right now.  The goal is to have a
71c          program that implement both the block and non-block method.
72c
73c  RESID   Double precision array of length N.  (INPUT/OUTPUT)
74c          On INPUT:  RESID contains the residual vector r_{k}.
75c          On OUTPUT: RESID contains the residual vector r_{k+p}.
76c
77c  RNORM   Double precision scalar.  (INPUT/OUTPUT)
78c          B-norm of the starting residual on input.
79c          B-norm of the updated residual r_{k+p} on output.
80c
81c  V       Double precision N by K+NP array.  (INPUT/OUTPUT)
82c          On INPUT:  V contains the Arnoldi vectors in the first K
83c          columns.
84c          On OUTPUT: V contains the new NP Arnoldi vectors in the next
85c          NP columns.  The first K columns are unchanged.
86c
87c  LDV     Integer.  (INPUT)
88c          Leading dimension of V exactly as declared in the calling
89c          program.
90c
91c  H       Double precision (K+NP) by (K+NP) array.  (INPUT/OUTPUT)
92c          H is used to store the generated upper Hessenberg matrix.
93c
94c  LDH     Integer.  (INPUT)
95c          Leading dimension of H exactly as declared in the calling
96c          program.
97c
98c  IPNTR   Integer array of length 3.  (OUTPUT)
99c          Pointer to mark the starting locations in the WORK for
100c          vectors used by the Arnoldi iteration.
101c          -------------------------------------------------------------
102c          IPNTR(1): pointer to the current operand vector X.
103c          IPNTR(2): pointer to the current result vector Y.
104c          IPNTR(3): pointer to the vector B * X when used in the
105c                    shift-and-invert mode.  X is the current operand.
106c          -------------------------------------------------------------
107c
108c  WORKD   Double precision work array of length 3*N.  (REVERSE COMMUNICATION)
109c          Distributed array to be used in the basic Arnoldi iteration
110c          for reverse communication.  The calling program should not
111c          use WORKD as temporary workspace during the iteration !!!!!!
112c          On input, WORKD(1:N) = B*RESID and is used to save some
113c          computation at the first step.
114c
115c  WORKL   Double precision work space used for Gram Schmidt orthogonalization
116c
117c  INFO    Integer.  (OUTPUT)
118c          = 0: Normal exit.
119c          > 0: Size of the spanning invariant subspace of OP found.
120c
121c\EndDoc
122c
123c-----------------------------------------------------------------------
124c
125c\BeginLib
126c
127c\Local variables:
128c     xxxxxx  real
129c
130c\References:
131c  1. D.C. Sorensen, "Implicit Application of Polynomial Filters in
132c     a k-Step Arnoldi Method", SIAM J. Matr. Anal. Apps., 13 (1992),
133c     pp 357-385.
134c  2. R.B. Lehoucq, "Analysis and Implementation of an Implicitly
135c     Restarted Arnoldi Iteration", Rice University Technical Report
136c     TR95-13, Department of Computational and Applied Mathematics.
137c
138c\Routines called:
139c     pdgetv0  Parallel ARPACK routine to generate the initial vector.
140c     pivout   Parallel ARPACK utility routine that prints integers.
141c     second   ARPACK utility routine for timing.
142c     pdmout   Parallel ARPACK utility routine that prints matrices
143c     pdvout   Parallel ARPACK utility routine that prints vectors.
144c     dlabad   LAPACK routine that computes machine constants.
145c     pdlamch  ScaLAPACK routine that determines machine constants.
146c     dlascl   LAPACK routine for careful scaling of a matrix.
147c     dlanhs   LAPACK routine that computes various norms of a matrix.
148c     dgemv    Level 2 BLAS routine for matrix vector multiplication.
149c     daxpy    Level 1 BLAS that computes a vector triad.
150c     dscal    Level 1 BLAS that scales a vector.
151c     dcopy    Level 1 BLAS that copies one vector to another .
152c     ddot     Level 1 BLAS that computes the scalar product of two vectors.
153c     pdnorm2  Parallel version of Level 1 BLAS that computes the norm of a vector.
154c
155c\Author
156c     Danny Sorensen               Phuong Vu
157c     Richard Lehoucq              CRPC / Rice University
158c     Dept. of Computational &     Houston, Texas
159c     Applied Mathematics
160c     Rice University
161c     Houston, Texas
162c
163c\Parallel Modifications
164c     Kristi Maschhoff
165c
166c\Revision history:
167c     Starting Point: Serial Code FILE: naitr.F   SID: 2.2
168c
169c\SCCS Information:
170c FILE: naitr.F   SID: 1.3   DATE OF SID: 3/19/97
171c
172c\Remarks
173c  The algorithm implemented is:
174c
175c  restart = .false.
176c  Given V_{k} = [v_{1}, ..., v_{k}], r_{k};
177c  r_{k} contains the initial residual vector even for k = 0;
178c  Also assume that rnorm = || B*r_{k} || and B*r_{k} are already
179c  computed by the calling program.
180c
181c  betaj = rnorm ; p_{k+1} = B*r_{k} ;
182c  For  j = k+1, ..., k+np  Do
183c     1) if ( betaj < tol ) stop or restart depending on j.
184c        ( At present tol is zero )
185c        if ( restart ) generate a new starting vector.
186c     2) v_{j} = r(j-1)/betaj;  V_{j} = [V_{j-1}, v_{j}];
187c        p_{j} = p_{j}/betaj
188c     3) r_{j} = OP*v_{j} where OP is defined as in pdnaupd
189c        For shift-invert mode p_{j} = B*v_{j} is already available.
190c        wnorm = || OP*v_{j} ||
191c     4) Compute the j-th step residual vector.
192c        w_{j} =  V_{j}^T * B * OP * v_{j}
193c        r_{j} =  OP*v_{j} - V_{j} * w_{j}
194c        H(:,j) = w_{j};
195c        H(j,j-1) = rnorm
196c        rnorm = || r_(j) ||
197c        If (rnorm > 0.717*wnorm) accept step and go back to 1)
198c     5) Re-orthogonalization step:
199c        s = V_{j}'*B*r_{j}
200c        r_{j} = r_{j} - V_{j}*s;  rnorm1 = || r_{j} ||
201c        alphaj = alphaj + s_{j};
202c     6) Iterative refinement step:
203c        If (rnorm1 > 0.717*rnorm) then
204c           rnorm = rnorm1
205c           accept step and go back to 1)
206c        Else
207c           rnorm = rnorm1
208c           If this is the first time in step 6), go to 5)
209c           Else r_{j} lies in the span of V_{j} numerically.
210c              Set r_{j} = 0 and rnorm = 0; go to 1)
211c        EndIf
212c  End Do
213c
214c\EndLib
215c
216c-----------------------------------------------------------------------
217c
218      subroutine pdnaitr
219     &   (comm, ido, bmat, n, k, np, nb, resid, rnorm, v, ldv, h, ldh,
220     &    ipntr, workd, workl, info)
221c
222      include   'mpif.h'
223c
224c     %---------------%
225c     | MPI Variables |
226c     %---------------%
227c
228      integer    comm
229c
230c     %----------------------------------------------------%
231c     | Include files for debugging and timing information |
232c     %----------------------------------------------------%
233c
234      include   'debug.h'
235      include   'stat.h'
236c
237c     %------------------%
238c     | Scalar Arguments |
239c     %------------------%
240c
241      character  bmat*1
242      integer    ido, info, k, ldh, ldv, n, nb, np
243      Double precision
244     &           rnorm
245c
246c     %-----------------%
247c     | Array Arguments |
248c     %-----------------%
249c
250      integer    ipntr(3)
251      Double precision
252     &           h(ldh,k+np), resid(n), v(ldv,k+np), workd(3*n),
253     &           workl(2*ldh)
254c
255c     %------------%
256c     | Parameters |
257c     %------------%
258c
259      Double precision
260     &           one, zero
261      parameter (one = 1.0, zero = 0.0)
262c
263c     %---------------%
264c     | Local Scalars |
265c     %---------------%
266c
267      logical    first, orth1, orth2, rstart, step3, step4
268      integer    ierr, i, infol, ipj, irj, ivj, iter, itry, j, msglvl,
269     &           jj
270      Double precision
271     &           betaj, ovfl, temp1, rnorm1, smlnum, tst1, ulp, unfl,
272     &           wnorm
273      save       first, orth1, orth2, rstart, step3, step4,
274     &           ierr, ipj, irj, ivj, iter, itry, j, msglvl, ovfl,
275     &           betaj, rnorm1, smlnum, ulp, unfl, wnorm
276c
277      Double precision
278     &           rnorm_buf
279c
280c
281c     %-----------------------%
282c     | Local Array Arguments |
283c     %-----------------------%
284c
285      Double precision
286     &           xtemp(2)
287c
288c     %----------------------%
289c     | External Subroutines |
290c     %----------------------%
291c
292      external   daxpy, dcopy, dscal, dgemv, pdgetv0, dlabad,
293     &           pdvout, pdmout, pivout, second
294c
295c     %--------------------%
296c     | External Functions |
297c     %--------------------%
298c
299      Double precision
300     &           ddot, pdnorm2, dlanhs, pdlamch10
301      external   ddot, pdnorm2, dlanhs, pdlamch10
302c
303c     %---------------------%
304c     | Intrinsic Functions |
305c     %---------------------%
306c
307      intrinsic    abs, sqrt
308c
309c     %-----------------%
310c     | Data statements |
311c     %-----------------%
312c
313      data      first / .true. /
314c
315c     %-----------------------%
316c     | Executable Statements |
317c     %-----------------------%
318c
319      if (first) then
320c
321c        %-----------------------------------------%
322c        | Set machine-dependent constants for the |
323c        | the splitting and deflation criterion.  |
324c        | If norm(H) <= sqrt(OVFL),               |
325c        | overflow should not occur.              |
326c        | REFERENCE: LAPACK subroutine dlahqr     |
327c        %-----------------------------------------%
328c
329         unfl = pdlamch10(comm, 'safe minimum' )
330         ovfl = one / unfl
331         call dlabad( unfl, ovfl )
332         ulp = pdlamch10( comm, 'precision' )
333         smlnum = unfl*( n / ulp )
334         first = .false.
335      end if
336c
337      if (ido .eq. 0) then
338c
339c        %-------------------------------%
340c        | Initialize timing statistics  |
341c        | & message level for debugging |
342c        %-------------------------------%
343c
344         call second (t0)
345         msglvl = mnaitr
346c
347c        %------------------------------%
348c        | Initial call to this routine |
349c        %------------------------------%
350c
351         info   = 0
352         step3  = .false.
353         step4  = .false.
354         rstart = .false.
355         orth1  = .false.
356         orth2  = .false.
357         j      = k + 1
358         ipj    = 1
359         irj    = ipj   + n
360         ivj    = irj   + n
361      end if
362c
363c     %-------------------------------------------------%
364c     | When in reverse communication mode one of:      |
365c     | STEP3, STEP4, ORTH1, ORTH2, RSTART              |
366c     | will be .true. when ....                        |
367c     | STEP3: return from computing OP*v_{j}.          |
368c     | STEP4: return from computing B-norm of OP*v_{j} |
369c     | ORTH1: return from computing B-norm of r_{j+1}  |
370c     | ORTH2: return from computing B-norm of          |
371c     |        correction to the residual vector.       |
372c     | RSTART: return from OP computations needed by   |
373c     |         pdgetv0.                                |
374c     %-------------------------------------------------%
375c
376      if (step3)  go to 50
377      if (step4)  go to 60
378      if (orth1)  go to 70
379      if (orth2)  go to 90
380      if (rstart) go to 30
381c
382c     %-----------------------------%
383c     | Else this is the first step |
384c     %-----------------------------%
385c
386c     %--------------------------------------------------------------%
387c     |                                                              |
388c     |        A R N O L D I     I T E R A T I O N     L O O P       |
389c     |                                                              |
390c     | Note:  B*r_{j-1} is already in WORKD(1:N)=WORKD(IPJ:IPJ+N-1) |
391c     %--------------------------------------------------------------%
392
393 1000 continue
394c
395         if (msglvl .gt. 1) then
396            call pivout (comm, logfil, 1, j, ndigit,
397     &                  '_naitr: generating Arnoldi vector number')
398            call pdvout (comm, logfil, 1, rnorm, ndigit,
399     &                  '_naitr: B-norm of the current residual is')
400         end if
401c
402c        %---------------------------------------------------%
403c        | STEP 1: Check if the B norm of j-th residual      |
404c        | vector is zero. Equivalent to determing whether   |
405c        | an exact j-step Arnoldi factorization is present. |
406c        %---------------------------------------------------%
407c
408         betaj = rnorm
409         if (rnorm .gt. zero) go to 40
410c
411c           %---------------------------------------------------%
412c           | Invariant subspace found, generate a new starting |
413c           | vector which is orthogonal to the current Arnoldi |
414c           | basis and continue the iteration.                 |
415c           %---------------------------------------------------%
416c
417            if (msglvl .gt. 0) then
418               call pivout (comm, logfil, 1, j, ndigit,
419     &                     '_naitr: ****** RESTART AT STEP ******')
420            end if
421c
422c           %---------------------------------------------%
423c           | ITRY is the loop variable that controls the |
424c           | maximum amount of times that a restart is   |
425c           | attempted. NRSTRT is used by stat.h         |
426c           %---------------------------------------------%
427c
428            betaj  = zero
429            nrstrt = nrstrt + 1
430            itry   = 1
431   20       continue
432            rstart = .true.
433            ido    = 0
434   30       continue
435c
436c           %--------------------------------------%
437c           | If in reverse communication mode and |
438c           | RSTART = .true. flow returns here.   |
439c           %--------------------------------------%
440c
441            call pdgetv0 ( comm, ido, bmat, itry, .false., n, j, v, ldv,
442     &                     resid, rnorm, ipntr, workd, workl, ierr)
443            if (ido .ne. 99) go to 9000
444            if (ierr .lt. 0) then
445               itry = itry + 1
446               if (itry .le. 3) go to 20
447c
448c              %------------------------------------------------%
449c              | Give up after several restart attempts.        |
450c              | Set INFO to the size of the invariant subspace |
451c              | which spans OP and exit.                       |
452c              %------------------------------------------------%
453c
454               info = j - 1
455               call second (t1)
456               tnaitr = tnaitr + (t1 - t0)
457               ido = 99
458               go to 9000
459            end if
460c
461   40    continue
462c
463c        %---------------------------------------------------------%
464c        | STEP 2:  v_{j} = r_{j-1}/rnorm and p_{j} = p_{j}/rnorm  |
465c        | Note that p_{j} = B*r_{j-1}. In order to avoid overflow |
466c        | when reciprocating a small RNORM, test against lower    |
467c        | machine bound.                                          |
468c        %---------------------------------------------------------%
469c
470         call dcopy (n, resid, 1, v(1,j), 1)
471         if (rnorm .ge. unfl) then
472             temp1 = one / rnorm
473             call dscal (n, temp1, v(1,j), 1)
474             call dscal (n, temp1, workd(ipj), 1)
475         else
476c
477c            %-----------------------------------------%
478c            | To scale both v_{j} and p_{j} carefully |
479c            | use LAPACK routine SLASCL               |
480c            %-----------------------------------------%
481c
482             call dlascl ('General', i, i, rnorm, one, n, 1,
483     &                    v(1,j), n, infol)
484             call dlascl ('General', i, i, rnorm, one, n, 1,
485     &                    workd(ipj), n, infol)
486         end if
487c
488c        %------------------------------------------------------%
489c        | STEP 3:  r_{j} = OP*v_{j}; Note that p_{j} = B*v_{j} |
490c        | Note that this is not quite yet r_{j}. See STEP 4    |
491c        %------------------------------------------------------%
492c
493         step3 = .true.
494         nopx  = nopx + 1
495         call second (t2)
496         call dcopy (n, v(1,j), 1, workd(ivj), 1)
497         ipntr(1) = ivj
498         ipntr(2) = irj
499         ipntr(3) = ipj
500         ido = 1
501c
502c        %-----------------------------------%
503c        | Exit in order to compute OP*v_{j} |
504c        %-----------------------------------%
505c
506         go to 9000
507   50    continue
508c
509c        %----------------------------------%
510c        | Back from reverse communication; |
511c        | WORKD(IRJ:IRJ+N-1) := OP*v_{j}   |
512c        | if step3 = .true.                |
513c        %----------------------------------%
514c
515         call second (t3)
516         tmvopx = tmvopx + (t3 - t2)
517
518         step3 = .false.
519c
520c        %------------------------------------------%
521c        | Put another copy of OP*v_{j} into RESID. |
522c        %------------------------------------------%
523c
524         call dcopy (n, workd(irj), 1, resid, 1)
525c
526c        %---------------------------------------%
527c        | STEP 4:  Finish extending the Arnoldi |
528c        |          factorization to length j.   |
529c        %---------------------------------------%
530c
531         call second (t2)
532         if (bmat .eq. 'G') then
533            nbx = nbx + 1
534            step4 = .true.
535            ipntr(1) = irj
536            ipntr(2) = ipj
537            ido = 2
538c
539c           %-------------------------------------%
540c           | Exit in order to compute B*OP*v_{j} |
541c           %-------------------------------------%
542c
543            go to 9000
544         else if (bmat .eq. 'I') then
545            call dcopy (n, resid, 1, workd(ipj), 1)
546         end if
547   60    continue
548c
549c        %----------------------------------%
550c        | Back from reverse communication; |
551c        | WORKD(IPJ:IPJ+N-1) := B*OP*v_{j} |
552c        | if step4 = .true.                |
553c        %----------------------------------%
554c
555         if (bmat .eq. 'G') then
556            call second (t3)
557            tmvbx = tmvbx + (t3 - t2)
558         end if
559c
560         step4 = .false.
561c
562c        %-------------------------------------%
563c        | The following is needed for STEP 5. |
564c        | Compute the B-norm of OP*v_{j}.     |
565c        %-------------------------------------%
566c
567         if (bmat .eq. 'G') then
568            rnorm_buf = ddot (n, resid, 1, workd(ipj), 1)
569            call MPI_ALLREDUCE( rnorm_buf, wnorm, 1,
570     &           MPI_DOUBLE_PRECISION, MPI_SUM, comm, ierr )
571            wnorm = sqrt(abs(wnorm))
572         else if (bmat .eq. 'I') then
573            wnorm = pdnorm2( comm, n, resid, 1 )
574         end if
575c
576c        %-----------------------------------------%
577c        | Compute the j-th residual corresponding |
578c        | to the j step factorization.            |
579c        | Use Classical Gram Schmidt and compute: |
580c        | w_{j} <-  V_{j}^T * B * OP * v_{j}      |
581c        | r_{j} <-  OP*v_{j} - V_{j} * w_{j}      |
582c        %-----------------------------------------%
583c
584c
585c        %------------------------------------------%
586c        | Compute the j Fourier coefficients w_{j} |
587c        | WORKD(IPJ:IPJ+N-1) contains B*OP*v_{j}.  |
588c        %------------------------------------------%
589c
590         call dgemv ('T', n, j, one, v, ldv, workd(ipj), 1,
591     &               zero, workl, 1)
592         call MPI_ALLREDUCE( workl, h(1,j), j,
593     &               MPI_DOUBLE_PRECISION, MPI_SUM, comm, ierr)
594c
595c        %--------------------------------------%
596c        | Orthogonalize r_{j} against V_{j}.   |
597c        | RESID contains OP*v_{j}. See STEP 3. |
598c        %--------------------------------------%
599c
600         call dgemv ('N', n, j, -one, v, ldv, h(1,j), 1,
601     &               one, resid, 1)
602c
603         if (j .gt. 1) h(j,j-1) = betaj
604c
605         call second (t4)
606c
607         orth1 = .true.
608c
609         call second (t2)
610         if (bmat .eq. 'G') then
611            nbx = nbx + 1
612            call dcopy (n, resid, 1, workd(irj), 1)
613            ipntr(1) = irj
614            ipntr(2) = ipj
615            ido = 2
616c
617c           %----------------------------------%
618c           | Exit in order to compute B*r_{j} |
619c           %----------------------------------%
620c
621            go to 9000
622         else if (bmat .eq. 'I') then
623            call dcopy (n, resid, 1, workd(ipj), 1)
624         end if
625   70    continue
626c
627c        %---------------------------------------------------%
628c        | Back from reverse communication if ORTH1 = .true. |
629c        | WORKD(IPJ:IPJ+N-1) := B*r_{j}.                    |
630c        %---------------------------------------------------%
631c
632         if (bmat .eq. 'G') then
633            call second (t3)
634            tmvbx = tmvbx + (t3 - t2)
635         end if
636c
637         orth1 = .false.
638c
639c        %------------------------------%
640c        | Compute the B-norm of r_{j}. |
641c        %------------------------------%
642c
643         if (bmat .eq. 'G') then
644            rnorm_buf = ddot (n, resid, 1, workd(ipj), 1)
645            call MPI_ALLREDUCE( rnorm_buf, rnorm, 1,
646     &           MPI_DOUBLE_PRECISION, MPI_SUM, comm, ierr )
647            rnorm = sqrt(abs(rnorm))
648         else if (bmat .eq. 'I') then
649            rnorm = pdnorm2( comm, n, resid, 1 )
650         end if
651c
652c        %-----------------------------------------------------------%
653c        | STEP 5: Re-orthogonalization / Iterative refinement phase |
654c        | Maximum NITER_ITREF tries.                                |
655c        |                                                           |
656c        |          s      = V_{j}^T * B * r_{j}                     |
657c        |          r_{j}  = r_{j} - V_{j}*s                         |
658c        |          alphaj = alphaj + s_{j}                          |
659c        |                                                           |
660c        | The stopping criteria used for iterative refinement is    |
661c        | discussed in Parlett's book SEP, page 107 and in Gragg &  |
662c        | Reichel ACM TOMS paper; Algorithm 686, Dec. 1990.         |
663c        | Determine if we need to correct the residual. The goal is |
664c        | to enforce ||v(:,1:j)^T * r_{j}|| .le. eps * || r_{j} ||  |
665c        | The following test determines whether the sine of the     |
666c        | angle between  OP*x and the computed residual is less     |
667c        | than or equal to 0.717.                                   |
668c        %-----------------------------------------------------------%
669c
670         if (rnorm .gt. 0.717*wnorm) go to 100
671         iter  = 0
672         nrorth = nrorth + 1
673c
674c        %---------------------------------------------------%
675c        | Enter the Iterative refinement phase. If further  |
676c        | refinement is necessary, loop back here. The loop |
677c        | variable is ITER. Perform a step of Classical     |
678c        | Gram-Schmidt using all the Arnoldi vectors V_{j}  |
679c        %---------------------------------------------------%
680c
681   80    continue
682c
683         if (msglvl .gt. 2) then
684            xtemp(1) = wnorm
685            xtemp(2) = rnorm
686            call pdvout (comm, logfil, 2, xtemp, ndigit,
687     &           '_naitr: re-orthonalization; wnorm and rnorm are')
688            call pdvout (comm, logfil, j, h(1,j), ndigit,
689     &                  '_naitr: j-th column of H')
690         end if
691c
692c        %----------------------------------------------------%
693c        | Compute V_{j}^T * B * r_{j}.                       |
694c        | WORKD(IRJ:IRJ+J-1) = v(:,1:J)'*WORKD(IPJ:IPJ+N-1). |
695c        %----------------------------------------------------%
696c
697         call dgemv ('T', n, j, one, v, ldv, workd(ipj), 1,
698     &               zero, workl(j+1), 1)
699         call MPI_ALLREDUCE( workl(j+1), workl(1), j,
700     &               MPI_DOUBLE_PRECISION, MPI_SUM, comm, ierr)
701c
702c        %---------------------------------------------%
703c        | Compute the correction to the residual:     |
704c        | r_{j} = r_{j} - V_{j} * WORKD(IRJ:IRJ+J-1). |
705c        | The correction to H is v(:,1:J)*H(1:J,1:J)  |
706c        | + v(:,1:J)*WORKD(IRJ:IRJ+J-1)*e'_j.         |
707c        %---------------------------------------------%
708c
709         call dgemv ('N', n, j, -one, v, ldv, workl(1), 1,
710     &               one, resid, 1)
711         call daxpy (j, one, workl(1), 1, h(1,j), 1)
712c
713         orth2 = .true.
714         call second (t2)
715         if (bmat .eq. 'G') then
716            nbx = nbx + 1
717            call dcopy (n, resid, 1, workd(irj), 1)
718            ipntr(1) = irj
719            ipntr(2) = ipj
720            ido = 2
721c
722c           %-----------------------------------%
723c           | Exit in order to compute B*r_{j}. |
724c           | r_{j} is the corrected residual.  |
725c           %-----------------------------------%
726c
727            go to 9000
728         else if (bmat .eq. 'I') then
729            call dcopy (n, resid, 1, workd(ipj), 1)
730         end if
731   90    continue
732c
733c        %---------------------------------------------------%
734c        | Back from reverse communication if ORTH2 = .true. |
735c        %---------------------------------------------------%
736c
737         if (bmat .eq. 'G') then
738            call second (t3)
739            tmvbx = tmvbx + (t3 - t2)
740         end if
741c
742c        %-----------------------------------------------------%
743c        | Compute the B-norm of the corrected residual r_{j}. |
744c        %-----------------------------------------------------%
745c
746         if (bmat .eq. 'G') then
747           rnorm_buf = ddot (n, resid, 1, workd(ipj), 1)
748           call MPI_ALLREDUCE( rnorm_buf, rnorm1, 1,
749     &          MPI_DOUBLE_PRECISION, MPI_SUM, comm, ierr )
750           rnorm1 = sqrt(abs(rnorm1))
751         else if (bmat .eq. 'I') then
752           rnorm1 = pdnorm2( comm, n, resid, 1 )
753         end if
754c
755         if (msglvl .gt. 0 .and. iter .gt. 0) then
756            call pivout (comm, logfil, 1, j, ndigit,
757     &           '_naitr: Iterative refinement for Arnoldi residual')
758            if (msglvl .gt. 2) then
759                xtemp(1) = rnorm
760                xtemp(2) = rnorm1
761                call pdvout (comm, logfil, 2, xtemp, ndigit,
762     &           '_naitr: iterative refinement ; rnorm and rnorm1 are')
763            end if
764         end if
765c
766c        %-----------------------------------------%
767c        | Determine if we need to perform another |
768c        | step of re-orthogonalization.           |
769c        %-----------------------------------------%
770c
771         if (rnorm1 .gt. 0.717*rnorm) then
772c
773c           %---------------------------------------%
774c           | No need for further refinement.       |
775c           | The cosine of the angle between the   |
776c           | corrected residual vector and the old |
777c           | residual vector is greater than 0.717 |
778c           | In other words the corrected residual |
779c           | and the old residual vector share an  |
780c           | angle of less than arcCOS(0.717)      |
781c           %---------------------------------------%
782c
783            rnorm = rnorm1
784c
785         else
786c
787c           %-------------------------------------------%
788c           | Another step of iterative refinement step |
789c           | is required. NITREF is used by stat.h     |
790c           %-------------------------------------------%
791c
792            nitref = nitref + 1
793            rnorm  = rnorm1
794            iter   = iter + 1
795            if (iter .le. 1) go to 80
796c
797c           %-------------------------------------------------%
798c           | Otherwise RESID is numerically in the span of V |
799c           %-------------------------------------------------%
800c
801            do 95 jj = 1, n
802               resid(jj) = zero
803  95        continue
804            rnorm = zero
805         end if
806c
807c        %----------------------------------------------%
808c        | Branch here directly if iterative refinement |
809c        | wasn't necessary or after at most NITER_REF  |
810c        | steps of iterative refinement.               |
811c        %----------------------------------------------%
812c
813  100    continue
814c
815         rstart = .false.
816         orth2  = .false.
817c
818         call second (t5)
819         titref = titref + (t5 - t4)
820c
821c        %------------------------------------%
822c        | STEP 6: Update  j = j+1;  Continue |
823c        %------------------------------------%
824c
825         j = j + 1
826         if (j .gt. k+np) then
827            call second (t1)
828            tnaitr = tnaitr + (t1 - t0)
829            ido = 99
830            do 110 i = max(1,k), k+np-1
831c
832c              %--------------------------------------------%
833c              | Check for splitting and deflation.         |
834c              | Use a standard test as in the QR algorithm |
835c              | REFERENCE: LAPACK subroutine dlahqr        |
836c              %--------------------------------------------%
837c
838               tst1 = abs( h( i, i ) ) + abs( h( i+1, i+1 ) )
839               if( tst1.eq.zero )
840     &              tst1 = dlanhs( '1', k+np, h, ldh, workd(n+1) )
841               if( abs( h( i+1,i ) ).le.max( ulp*tst1, smlnum ) )
842     &              h(i+1,i) = zero
843 110        continue
844c
845            if (msglvl .gt. 2) then
846               call pdmout (comm, logfil, k+np, k+np, h, ldh, ndigit,
847     &          '_naitr: Final upper Hessenberg matrix H of order K+NP')
848            end if
849c
850            go to 9000
851         end if
852c
853c        %--------------------------------------------------------%
854c        | Loop back to extend the factorization by another step. |
855c        %--------------------------------------------------------%
856c
857      go to 1000
858c
859c     %---------------------------------------------------------------%
860c     |                                                               |
861c     |  E N D     O F     M A I N     I T E R A T I O N     L O O P  |
862c     |                                                               |
863c     %---------------------------------------------------------------%
864c
865 9000 continue
866      return
867c
868c     %----------------%
869c     | End of pdnaitr |
870c     %----------------%
871c
872      end
873