1c\BeginDoc
2c
3c\Name: pznapps
4c
5c Message Passing Layer: BLACS
6c
7c\Description:
8c  Given the Arnoldi factorization
9c
10c     A*V_{k} - V_{k}*H_{k} = r_{k+p}*e_{k+p}^T,
11c
12c  apply NP implicit shifts resulting in
13c
14c     A*(V_{k}*Q) - (V_{k}*Q)*(Q^T* H_{k}*Q) = r_{k+p}*e_{k+p}^T * Q
15c
16c  where Q is an orthogonal matrix which is the product of rotations
17c  and reflections resulting from the NP bulge change sweeps.
18c  The updated Arnoldi factorization becomes:
19c
20c     A*VNEW_{k} - VNEW_{k}*HNEW_{k} = rnew_{k}*e_{k}^T.
21c
22c\Usage:
23c  call pznapps
24c     ( COMM, N, KEV, NP, SHIFT, V, LDV, H, LDH, RESID, Q, LDQ,
25c       WORKL, WORKD )
26c
27c\Arguments
28c  COMM    BLACS Communicator for the processor grid.  (INPUT)
29c
30c  N       Integer.  (INPUT)
31c          Problem size, i.e. size of matrix A.
32c
33c  KEV     Integer.  (INPUT/OUTPUT)
34c          KEV+NP is the size of the input matrix H.
35c          KEV is the size of the updated matrix HNEW.
36c
37c  NP      Integer.  (INPUT)
38c          Number of implicit shifts to be applied.
39c
40c  SHIFT   Complex*16 array of length NP.  (INPUT)
41c          The shifts to be applied.
42c
43c  V       Complex*16 N by (KEV+NP) array.  (INPUT/OUTPUT)
44c          On INPUT, V contains the current KEV+NP Arnoldi vectors.
45c          On OUTPUT, V contains the updated KEV Arnoldi vectors
46c          in the first KEV columns of V.
47c
48c  LDV     Integer.  (INPUT)
49c          Leading dimension of V exactly as declared in the calling
50c          program.
51c
52c  H       Complex*16 (KEV+NP) by (KEV+NP) array.  (INPUT/OUTPUT)
53c          On INPUT, H contains the current KEV+NP by KEV+NP upper
54c          Hessenberg matrix of the Arnoldi factorization.
55c          On OUTPUT, H contains the updated KEV by KEV upper Hessenberg
56c          matrix in the KEV leading submatrix.
57c
58c  LDH     Integer.  (INPUT)
59c          Leading dimension of H exactly as declared in the calling
60c          program.
61c
62c  RESID   Complex*16 array of length N.  (INPUT/OUTPUT)
63c          On INPUT, RESID contains the the residual vector r_{k+p}.
64c          On OUTPUT, RESID is the update residual vector rnew_{k}
65c          in the first KEV locations.
66c
67c  Q       Complex*16 KEV+NP by KEV+NP work array.  (WORKSPACE)
68c          Work array used to accumulate the rotations and reflections
69c          during the bulge chase sweep.
70c
71c  LDQ     Integer.  (INPUT)
72c          Leading dimension of Q exactly as declared in the calling
73c          program.
74c
75c  WORKL   Complex*16 work array of length (KEV+NP).  (WORKSPACE)
76c          Private (replicated) array on each PE or array allocated on
77c          the front end.
78c
79c  WORKD   Complex*16 work array of length 2*N.  (WORKSPACE)
80c          Distributed array used in the application of the accumulated
81c          orthogonal matrix Q.
82c
83c\EndDoc
84c
85c-----------------------------------------------------------------------
86c
87c\BeginLib
88c
89c\Local variables:
90c     xxxxxx  Complex*16
91c
92c\References:
93c  1. D.C. Sorensen, "Implicit Application of Polynomial Filters in
94c     a k-Step Arnoldi Method", SIAM J. Matr. Anal. Apps., 13 (1992),
95c     pp 357-385.
96c
97c\Routines called:
98c     pivout  Parallel ARPACK utility routine that prints integers.
99c     arscnd  ARPACK utility routine for timing.
100c     pzmout  Parallel ARPACK utility routine that prints matrices
101c     pzvout  Parallel ARPACK utility routine that prints vectors.
102c     zlacpy  LAPACK matrix copy routine.
103c     zlanhs  LAPACK routine that computes various norms of a matrix.
104c     zlartg  LAPACK Givens rotation construction routine.
105c     zlaset  LAPACK matrix initialization routine.
106c     dlabad  LAPACK routine for defining the underflow and overflow
107c             limits.
108c     pdlamch ScaLAPACK routine that determines machine constants.
109c     dlapy2  LAPACK routine to compute sqrt(x**2+y**2) carefully.
110c     zgemv   Level 2 BLAS routine for matrix vector multiplication.
111c     zaxpy   Level 1 BLAS that computes a vector triad.
112c     zcopy   Level 1 BLAS that copies one vector to another.
113c     zscal   Level 1 BLAS that scales a vector.
114c
115c\Author
116c     Danny Sorensen               Phuong Vu
117c     Richard Lehoucq              CRPC / Rice University
118c     Dept. of Computational &     Houston, Texas
119c     Applied Mathematics
120c     Rice University
121c     Houston, Texas
122c
123c\Parallel Modifications
124c     Kristi Maschhoff
125c
126c\Revision history:
127c     Starting Point: Serial Complex Code FILE: napps.F   SID: 2.1
128c
129c\SCCS Information:
130c FILE: napps.F   SID: 1.4   DATE OF SID: 10/25/03
131c
132c\Remarks
133c  1. In this version, each shift is applied to all the sublocks of
134c     the Hessenberg matrix H and not just to the submatrix that it
135c     comes from. Deflation as in LAPACK routine zlahqr (QR algorithm
136c     for upper Hessenberg matrices ) is used.
137c     Upon output, the subdiagonals of H are enforced to be non-negative
138c     real numbers.
139c
140c\EndLib
141c
142c-----------------------------------------------------------------------
143c
144      subroutine pznapps
145     &   ( comm, n, kev, np, shift, v, ldv, h, ldh, resid, q, ldq,
146     &     workl, workd )
147c
148c     %--------------------%
149c     | BLACS Communicator |
150c     %--------------------%
151c
152      integer   comm
153c
154c     %----------------------------------------------------%
155c     | Include files for debugging and timing information |
156c     %----------------------------------------------------%
157c
158      include   'debug.h'
159      include   'stat.h'
160c
161c     %------------------%
162c     | Scalar Arguments |
163c     %------------------%
164c
165      integer    kev, ldh, ldq, ldv, n, np
166c
167c     %-----------------%
168c     | Array Arguments |
169c     %-----------------%
170c
171      Complex*16
172     &           h(ldh,kev+np), resid(n), shift(np),
173     &           v(ldv,kev+np), q(ldq,kev+np), workd(2*n), workl(kev+np)
174c
175c     %------------%
176c     | Parameters |
177c     %------------%
178c
179      Complex*16
180     &           one, zero
181      Double precision
182     &           rzero
183      parameter (one = (1.0, 0.0), zero = (0.0, 0.0),
184     &           rzero = 0.0)
185c
186c     %------------------------%
187c     | Local Scalars & Arrays |
188c     %------------------------%
189c
190      integer    i, iend, istart, j, jj, kplusp, msglvl
191      logical    first
192      Complex*16
193     &           cdum, f, g, h11, h21, r, s, sigma, t
194      Double precision
195     &           c,  ovfl, smlnum, ulp, unfl, tst1
196      save       first, ovfl, smlnum, ulp, unfl
197c
198c     %----------------------%
199c     | External Subroutines |
200c     %----------------------%
201c
202      external   zaxpy, zcopy, zgemv, zscal, zlacpy, zlartg,
203     &           pzvout, zlaset, dlabad, pzmout, arscnd, pivout
204c
205c     %--------------------%
206c     | External Functions |
207c     %--------------------%
208c
209      Double precision
210     &           zlanhs, pdlamch, dlapy2
211      external   zlanhs, pdlamch, dlapy2
212c
213c     %----------------------%
214c     | Intrinsics Functions |
215c     %----------------------%
216c
217      intrinsic  abs, dimag, conjg, dcmplx, max, min, dble
218c
219c     %---------------------%
220c     | Statement Functions |
221c     %---------------------%
222c
223      Double precision
224     &           cabs1
225      cabs1( cdum ) = abs( dble( cdum ) ) + abs( dimag( cdum ) )
226c
227c     %----------------%
228c     | Data statements |
229c     %----------------%
230c
231      data       first / .true. /
232c
233c     %-----------------------%
234c     | Executable Statements |
235c     %-----------------------%
236c
237      if (first) then
238c
239c        %-----------------------------------------------%
240c        | Set machine-dependent constants for the       |
241c        | stopping criterion. If norm(H) <= sqrt(OVFL), |
242c        | overflow should not occur.                    |
243c        | REFERENCE: LAPACK subroutine zlahqr           |
244c        %-----------------------------------------------%
245c
246         unfl = pdlamch( comm, 'safe minimum' )
247         ovfl = dble(one / unfl)
248         call dlabad( unfl, ovfl )
249         ulp = dlamch( 'precision' )
250         smlnum = unfl*( n / ulp )
251         first = .false.
252      end if
253c
254c     %-------------------------------%
255c     | Initialize timing statistics  |
256c     | & message level for debugging |
257c     %-------------------------------%
258c
259      call arscnd (t0)
260      msglvl = mcapps
261c
262      kplusp = kev + np
263c
264c     %--------------------------------------------%
265c     | Initialize Q to the identity to accumulate |
266c     | the rotations and reflections              |
267c     %--------------------------------------------%
268c
269      call zlaset ('All', kplusp, kplusp, zero, one, q, ldq)
270c
271c     %----------------------------------------------%
272c     | Quick return if there are no shifts to apply |
273c     %----------------------------------------------%
274c
275      if (np .eq. 0) go to 9000
276c
277c     %----------------------------------------------%
278c     | Chase the bulge with the application of each |
279c     | implicit shift. Each shift is applied to the |
280c     | whole matrix including each block.           |
281c     %----------------------------------------------%
282c
283      do 110 jj = 1, np
284         sigma = shift(jj)
285c
286         if (msglvl .gt. 2 ) then
287            call pivout (comm, logfil, 1, [jj], ndigit,
288     &               '_napps: shift number.')
289            call pzvout (comm, logfil, 1, [sigma], ndigit,
290     &               '_napps: Value of the shift ')
291         end if
292c
293         istart = 1
294   20    continue
295c
296         do 30 i = istart, kplusp-1
297c
298c           %----------------------------------------%
299c           | Check for splitting and deflation. Use |
300c           | a standard test as in the QR algorithm |
301c           | REFERENCE: LAPACK subroutine zlahqr    |
302c           %----------------------------------------%
303c
304            tst1 = cabs1( h( i, i ) ) + cabs1( h( i+1, i+1 ) )
305            if( tst1.eq.rzero )
306     &         tst1 = zlanhs( '1', kplusp-jj+1, h, ldh, workl )
307            if ( abs(dble(h(i+1,i)))
308     &           .le. max(ulp*tst1, smlnum) )  then
309               if (msglvl .gt. 0) then
310                  call pivout (comm, logfil, 1, [i], ndigit,
311     &                 '_napps: matrix splitting at row/column no.')
312                  call pivout (comm, logfil, 1, [jj], ndigit,
313     &                 '_napps: matrix splitting with shift number.')
314                  call pzvout (comm, logfil, 1, h(i+1,i), ndigit,
315     &                 '_napps: off diagonal element.')
316               end if
317               iend = i
318               h(i+1,i) = zero
319               go to 40
320            end if
321   30    continue
322         iend = kplusp
323   40    continue
324c
325         if (msglvl .gt. 2) then
326             call pivout (comm, logfil, 1, [istart], ndigit,
327     &                   '_napps: Start of current block ')
328             call pivout (comm, logfil, 1, [iend], ndigit,
329     &                   '_napps: End of current block ')
330         end if
331c
332c        %------------------------------------------------%
333c        | No reason to apply a shift to block of order 1 |
334c        | or if the current block starts after the point |
335c        | of compression since we'll discard this stuff  |
336c        %------------------------------------------------%
337c
338         if ( istart .eq. iend .or. istart .gt. kev) go to 100
339c
340         h11 = h(istart,istart)
341         h21 = h(istart+1,istart)
342         f = h11 - sigma
343         g = h21
344c
345         do 80 i = istart, iend-1
346c
347c           %------------------------------------------------------%
348c           | Construct the plane rotation G to zero out the bulge |
349c           %------------------------------------------------------%
350c
351            call zlartg (f, g, c, s, r)
352            if (i .gt. istart) then
353               h(i,i-1) = r
354               h(i+1,i-1) = zero
355            end if
356c
357c           %---------------------------------------------%
358c           | Apply rotation to the left of H;  H <- G'*H |
359c           %---------------------------------------------%
360c
361            do 50 j = i, kplusp
362               t        =  c*h(i,j) + s*h(i+1,j)
363               h(i+1,j) = -conjg(s)*h(i,j) + c*h(i+1,j)
364               h(i,j)   = t
365   50       continue
366c
367c           %---------------------------------------------%
368c           | Apply rotation to the right of H;  H <- H*G |
369c           %---------------------------------------------%
370c
371            do 60 j = 1, min(i+2,iend)
372               t        =  c*h(j,i) + conjg(s)*h(j,i+1)
373               h(j,i+1) = -s*h(j,i) + c*h(j,i+1)
374               h(j,i)   = t
375   60       continue
376c
377c           %-----------------------------------------------------%
378c           | Accumulate the rotation in the matrix Q;  Q <- Q*G' |
379c           %-----------------------------------------------------%
380c
381            do 70 j = 1, min(i+jj, kplusp)
382               t        =   c*q(j,i) + conjg(s)*q(j,i+1)
383               q(j,i+1) = - s*q(j,i) + c*q(j,i+1)
384               q(j,i)   = t
385   70       continue
386c
387c           %---------------------------%
388c           | Prepare for next rotation |
389c           %---------------------------%
390c
391            if (i .lt. iend-1) then
392               f = h(i+1,i)
393               g = h(i+2,i)
394            end if
395   80    continue
396c
397c        %-------------------------------%
398c        | Finished applying the shift.  |
399c        %-------------------------------%
400c
401  100    continue
402c
403c        %---------------------------------------------------------%
404c        | Apply the same shift to the next block if there is any. |
405c        %---------------------------------------------------------%
406c
407         istart = iend + 1
408         if (iend .lt. kplusp) go to 20
409c
410c        %---------------------------------------------%
411c        | Loop back to the top to get the next shift. |
412c        %---------------------------------------------%
413c
414  110 continue
415c
416c     %---------------------------------------------------%
417c     | Perform a similarity transformation that makes    |
418c     | sure that the compressed H will have non-negative |
419c     | real subdiagonal elements.                        |
420c     %---------------------------------------------------%
421c
422      do 120 j=1,kev
423         if ( dble( h(j+1,j) ) .lt. rzero .or.
424     &        dimag( h(j+1,j) ) .ne. rzero ) then
425            t = h(j+1,j) / dlapy2(dble(h(j+1,j)),dimag(h(j+1,j)))
426            call zscal( kplusp-j+1, conjg(t), h(j+1,j), ldh )
427            call zscal( min(j+2, kplusp), t, h(1,j+1), 1 )
428            call zscal( min(j+np+1,kplusp), t, q(1,j+1), 1 )
429            h(j+1,j) = dcmplx( dble( h(j+1,j) ), rzero )
430         end if
431  120 continue
432c
433      do 130 i = 1, kev
434c
435c        %--------------------------------------------%
436c        | Final check for splitting and deflation.   |
437c        | Use a standard test as in the QR algorithm |
438c        | REFERENCE: LAPACK subroutine zlahqr.       |
439c        | Note: Since the subdiagonals of the        |
440c        | compressed H are nonnegative real numbers, |
441c        | we take advantage of this.                 |
442c        %--------------------------------------------%
443c
444         tst1 = cabs1( h( i, i ) ) + cabs1( h( i+1, i+1 ) )
445         if( tst1 .eq. rzero )
446     &       tst1 = zlanhs( '1', kev, h, ldh, workl )
447         if( dble( h( i+1,i ) ) .le. max( ulp*tst1, smlnum ) )
448     &       h(i+1,i) = zero
449 130  continue
450c
451c     %-------------------------------------------------%
452c     | Compute the (kev+1)-st column of (V*Q) and      |
453c     | temporarily store the result in WORKD(N+1:2*N). |
454c     | This is needed in the residual update since we  |
455c     | cannot GUARANTEE that the corresponding entry   |
456c     | of H would be zero as in exact arithmetic.      |
457c     %-------------------------------------------------%
458c
459      if ( dble( h(kev+1,kev) ) .gt. rzero )
460     &   call zgemv ('N', n, kplusp, one, v, ldv, q(1,kev+1), 1, zero,
461     &                workd(n+1), 1)
462c
463c     %----------------------------------------------------------%
464c     | Compute column 1 to kev of (V*Q) in backward order       |
465c     | taking advantage of the upper Hessenberg structure of Q. |
466c     %----------------------------------------------------------%
467c
468      do 140 i = 1, kev
469         call zgemv ('N', n, kplusp-i+1, one, v, ldv,
470     &               q(1,kev-i+1), 1, zero, workd, 1)
471         call zcopy (n, workd, 1, v(1,kplusp-i+1), 1)
472  140 continue
473c
474c     %-------------------------------------------------%
475c     |  Move v(:,kplusp-kev+1:kplusp) into v(:,1:kev). |
476c     %-------------------------------------------------%
477c
478      call zlacpy ('A', n, kev, v(1,kplusp-kev+1), ldv, v, ldv)
479c
480c     %--------------------------------------------------------------%
481c     | Copy the (kev+1)-st column of (V*Q) in the appropriate place |
482c     %--------------------------------------------------------------%
483c
484      if ( dble( h(kev+1,kev) ) .gt. rzero )
485     &   call zcopy (n, workd(n+1), 1, v(1,kev+1), 1)
486c
487c     %-------------------------------------%
488c     | Update the residual vector:         |
489c     |    r <- sigmak*r + betak*v(:,kev+1) |
490c     | where                               |
491c     |    sigmak = (e_{kev+p}'*Q)*e_{kev}  |
492c     |    betak = e_{kev+1}'*H*e_{kev}     |
493c     %-------------------------------------%
494c
495      call zscal (n, q(kplusp,kev), resid, 1)
496      if ( dble( h(kev+1,kev) ) .gt. rzero )
497     &   call zaxpy (n, h(kev+1,kev), v(1,kev+1), 1, resid, 1)
498c
499      if (msglvl .gt. 1) then
500         call pzvout (comm, logfil, 1, q(kplusp,kev), ndigit,
501     &        '_napps: sigmak = (e_{kev+p}^T*Q)*e_{kev}')
502         call pzvout (comm, logfil, 1, h(kev+1,kev), ndigit,
503     &        '_napps: betak = e_{kev+1}^T*H*e_{kev}')
504         call pivout (comm, logfil, 1, [kev], ndigit,
505     &               '_napps: Order of the final Hessenberg matrix ')
506         if (msglvl .gt. 2) then
507            call pzmout (comm, logfil, kev, kev, h, ldh, ndigit,
508     &      '_napps: updated Hessenberg matrix H for next iteration')
509         end if
510c
511      end if
512c
513 9000 continue
514      call arscnd (t1)
515      tcapps = tcapps + (t1 - t0)
516c
517      return
518c
519c     %----------------%
520c     | End of pznapps |
521c     %----------------%
522c
523      end
524