1c\BeginDoc 2c 3c\Name: pznapps 4c 5c Message Passing Layer: BLACS 6c 7c\Description: 8c Given the Arnoldi factorization 9c 10c A*V_{k} - V_{k}*H_{k} = r_{k+p}*e_{k+p}^T, 11c 12c apply NP implicit shifts resulting in 13c 14c A*(V_{k}*Q) - (V_{k}*Q)*(Q^T* H_{k}*Q) = r_{k+p}*e_{k+p}^T * Q 15c 16c where Q is an orthogonal matrix which is the product of rotations 17c and reflections resulting from the NP bulge change sweeps. 18c The updated Arnoldi factorization becomes: 19c 20c A*VNEW_{k} - VNEW_{k}*HNEW_{k} = rnew_{k}*e_{k}^T. 21c 22c\Usage: 23c call pznapps 24c ( COMM, N, KEV, NP, SHIFT, V, LDV, H, LDH, RESID, Q, LDQ, 25c WORKL, WORKD ) 26c 27c\Arguments 28c COMM BLACS Communicator for the processor grid. (INPUT) 29c 30c N Integer. (INPUT) 31c Problem size, i.e. size of matrix A. 32c 33c KEV Integer. (INPUT/OUTPUT) 34c KEV+NP is the size of the input matrix H. 35c KEV is the size of the updated matrix HNEW. 36c 37c NP Integer. (INPUT) 38c Number of implicit shifts to be applied. 39c 40c SHIFT Complex*16 array of length NP. (INPUT) 41c The shifts to be applied. 42c 43c V Complex*16 N by (KEV+NP) array. (INPUT/OUTPUT) 44c On INPUT, V contains the current KEV+NP Arnoldi vectors. 45c On OUTPUT, V contains the updated KEV Arnoldi vectors 46c in the first KEV columns of V. 47c 48c LDV Integer. (INPUT) 49c Leading dimension of V exactly as declared in the calling 50c program. 51c 52c H Complex*16 (KEV+NP) by (KEV+NP) array. (INPUT/OUTPUT) 53c On INPUT, H contains the current KEV+NP by KEV+NP upper 54c Hessenberg matrix of the Arnoldi factorization. 55c On OUTPUT, H contains the updated KEV by KEV upper Hessenberg 56c matrix in the KEV leading submatrix. 57c 58c LDH Integer. (INPUT) 59c Leading dimension of H exactly as declared in the calling 60c program. 61c 62c RESID Complex*16 array of length N. (INPUT/OUTPUT) 63c On INPUT, RESID contains the the residual vector r_{k+p}. 64c On OUTPUT, RESID is the update residual vector rnew_{k} 65c in the first KEV locations. 66c 67c Q Complex*16 KEV+NP by KEV+NP work array. (WORKSPACE) 68c Work array used to accumulate the rotations and reflections 69c during the bulge chase sweep. 70c 71c LDQ Integer. (INPUT) 72c Leading dimension of Q exactly as declared in the calling 73c program. 74c 75c WORKL Complex*16 work array of length (KEV+NP). (WORKSPACE) 76c Private (replicated) array on each PE or array allocated on 77c the front end. 78c 79c WORKD Complex*16 work array of length 2*N. (WORKSPACE) 80c Distributed array used in the application of the accumulated 81c orthogonal matrix Q. 82c 83c\EndDoc 84c 85c----------------------------------------------------------------------- 86c 87c\BeginLib 88c 89c\Local variables: 90c xxxxxx Complex*16 91c 92c\References: 93c 1. D.C. Sorensen, "Implicit Application of Polynomial Filters in 94c a k-Step Arnoldi Method", SIAM J. Matr. Anal. Apps., 13 (1992), 95c pp 357-385. 96c 97c\Routines called: 98c pivout Parallel ARPACK utility routine that prints integers. 99c arscnd ARPACK utility routine for timing. 100c pzmout Parallel ARPACK utility routine that prints matrices 101c pzvout Parallel ARPACK utility routine that prints vectors. 102c zlacpy LAPACK matrix copy routine. 103c zlanhs LAPACK routine that computes various norms of a matrix. 104c zlartg LAPACK Givens rotation construction routine. 105c zlaset LAPACK matrix initialization routine. 106c dlabad LAPACK routine for defining the underflow and overflow 107c limits. 108c pdlamch ScaLAPACK routine that determines machine constants. 109c dlapy2 LAPACK routine to compute sqrt(x**2+y**2) carefully. 110c zgemv Level 2 BLAS routine for matrix vector multiplication. 111c zaxpy Level 1 BLAS that computes a vector triad. 112c zcopy Level 1 BLAS that copies one vector to another. 113c zscal Level 1 BLAS that scales a vector. 114c 115c\Author 116c Danny Sorensen Phuong Vu 117c Richard Lehoucq CRPC / Rice University 118c Dept. of Computational & Houston, Texas 119c Applied Mathematics 120c Rice University 121c Houston, Texas 122c 123c\Parallel Modifications 124c Kristi Maschhoff 125c 126c\Revision history: 127c Starting Point: Serial Complex Code FILE: napps.F SID: 2.1 128c 129c\SCCS Information: 130c FILE: napps.F SID: 1.4 DATE OF SID: 10/25/03 131c 132c\Remarks 133c 1. In this version, each shift is applied to all the sublocks of 134c the Hessenberg matrix H and not just to the submatrix that it 135c comes from. Deflation as in LAPACK routine zlahqr (QR algorithm 136c for upper Hessenberg matrices ) is used. 137c Upon output, the subdiagonals of H are enforced to be non-negative 138c real numbers. 139c 140c\EndLib 141c 142c----------------------------------------------------------------------- 143c 144 subroutine pznapps 145 & ( comm, n, kev, np, shift, v, ldv, h, ldh, resid, q, ldq, 146 & workl, workd ) 147c 148c %--------------------% 149c | BLACS Communicator | 150c %--------------------% 151c 152 integer comm 153c 154c %----------------------------------------------------% 155c | Include files for debugging and timing information | 156c %----------------------------------------------------% 157c 158 include 'debug.h' 159 include 'stat.h' 160c 161c %------------------% 162c | Scalar Arguments | 163c %------------------% 164c 165 integer kev, ldh, ldq, ldv, n, np 166c 167c %-----------------% 168c | Array Arguments | 169c %-----------------% 170c 171 Complex*16 172 & h(ldh,kev+np), resid(n), shift(np), 173 & v(ldv,kev+np), q(ldq,kev+np), workd(2*n), workl(kev+np) 174c 175c %------------% 176c | Parameters | 177c %------------% 178c 179 Complex*16 180 & one, zero 181 Double precision 182 & rzero 183 parameter (one = (1.0, 0.0), zero = (0.0, 0.0), 184 & rzero = 0.0) 185c 186c %------------------------% 187c | Local Scalars & Arrays | 188c %------------------------% 189c 190 integer i, iend, istart, j, jj, kplusp, msglvl 191 logical first 192 Complex*16 193 & cdum, f, g, h11, h21, r, s, sigma, t 194 Double precision 195 & c, ovfl, smlnum, ulp, unfl, tst1 196 save first, ovfl, smlnum, ulp, unfl 197c 198c %----------------------% 199c | External Subroutines | 200c %----------------------% 201c 202 external zaxpy, zcopy, zgemv, zscal, zlacpy, zlartg, 203 & pzvout, zlaset, dlabad, pzmout, arscnd, pivout 204c 205c %--------------------% 206c | External Functions | 207c %--------------------% 208c 209 Double precision 210 & zlanhs, pdlamch, dlapy2 211 external zlanhs, pdlamch, dlapy2 212c 213c %----------------------% 214c | Intrinsics Functions | 215c %----------------------% 216c 217 intrinsic abs, dimag, conjg, dcmplx, max, min, dble 218c 219c %---------------------% 220c | Statement Functions | 221c %---------------------% 222c 223 Double precision 224 & cabs1 225 cabs1( cdum ) = abs( dble( cdum ) ) + abs( dimag( cdum ) ) 226c 227c %----------------% 228c | Data statements | 229c %----------------% 230c 231 data first / .true. / 232c 233c %-----------------------% 234c | Executable Statements | 235c %-----------------------% 236c 237 if (first) then 238c 239c %-----------------------------------------------% 240c | Set machine-dependent constants for the | 241c | stopping criterion. If norm(H) <= sqrt(OVFL), | 242c | overflow should not occur. | 243c | REFERENCE: LAPACK subroutine zlahqr | 244c %-----------------------------------------------% 245c 246 unfl = pdlamch( comm, 'safe minimum' ) 247 ovfl = dble(one / unfl) 248 call dlabad( unfl, ovfl ) 249 ulp = dlamch( 'precision' ) 250 smlnum = unfl*( n / ulp ) 251 first = .false. 252 end if 253c 254c %-------------------------------% 255c | Initialize timing statistics | 256c | & message level for debugging | 257c %-------------------------------% 258c 259 call arscnd (t0) 260 msglvl = mcapps 261c 262 kplusp = kev + np 263c 264c %--------------------------------------------% 265c | Initialize Q to the identity to accumulate | 266c | the rotations and reflections | 267c %--------------------------------------------% 268c 269 call zlaset ('All', kplusp, kplusp, zero, one, q, ldq) 270c 271c %----------------------------------------------% 272c | Quick return if there are no shifts to apply | 273c %----------------------------------------------% 274c 275 if (np .eq. 0) go to 9000 276c 277c %----------------------------------------------% 278c | Chase the bulge with the application of each | 279c | implicit shift. Each shift is applied to the | 280c | whole matrix including each block. | 281c %----------------------------------------------% 282c 283 do 110 jj = 1, np 284 sigma = shift(jj) 285c 286 if (msglvl .gt. 2 ) then 287 call pivout (comm, logfil, 1, [jj], ndigit, 288 & '_napps: shift number.') 289 call pzvout (comm, logfil, 1, [sigma], ndigit, 290 & '_napps: Value of the shift ') 291 end if 292c 293 istart = 1 294 20 continue 295c 296 do 30 i = istart, kplusp-1 297c 298c %----------------------------------------% 299c | Check for splitting and deflation. Use | 300c | a standard test as in the QR algorithm | 301c | REFERENCE: LAPACK subroutine zlahqr | 302c %----------------------------------------% 303c 304 tst1 = cabs1( h( i, i ) ) + cabs1( h( i+1, i+1 ) ) 305 if( tst1.eq.rzero ) 306 & tst1 = zlanhs( '1', kplusp-jj+1, h, ldh, workl ) 307 if ( abs(dble(h(i+1,i))) 308 & .le. max(ulp*tst1, smlnum) ) then 309 if (msglvl .gt. 0) then 310 call pivout (comm, logfil, 1, [i], ndigit, 311 & '_napps: matrix splitting at row/column no.') 312 call pivout (comm, logfil, 1, [jj], ndigit, 313 & '_napps: matrix splitting with shift number.') 314 call pzvout (comm, logfil, 1, h(i+1,i), ndigit, 315 & '_napps: off diagonal element.') 316 end if 317 iend = i 318 h(i+1,i) = zero 319 go to 40 320 end if 321 30 continue 322 iend = kplusp 323 40 continue 324c 325 if (msglvl .gt. 2) then 326 call pivout (comm, logfil, 1, [istart], ndigit, 327 & '_napps: Start of current block ') 328 call pivout (comm, logfil, 1, [iend], ndigit, 329 & '_napps: End of current block ') 330 end if 331c 332c %------------------------------------------------% 333c | No reason to apply a shift to block of order 1 | 334c | or if the current block starts after the point | 335c | of compression since we'll discard this stuff | 336c %------------------------------------------------% 337c 338 if ( istart .eq. iend .or. istart .gt. kev) go to 100 339c 340 h11 = h(istart,istart) 341 h21 = h(istart+1,istart) 342 f = h11 - sigma 343 g = h21 344c 345 do 80 i = istart, iend-1 346c 347c %------------------------------------------------------% 348c | Construct the plane rotation G to zero out the bulge | 349c %------------------------------------------------------% 350c 351 call zlartg (f, g, c, s, r) 352 if (i .gt. istart) then 353 h(i,i-1) = r 354 h(i+1,i-1) = zero 355 end if 356c 357c %---------------------------------------------% 358c | Apply rotation to the left of H; H <- G'*H | 359c %---------------------------------------------% 360c 361 do 50 j = i, kplusp 362 t = c*h(i,j) + s*h(i+1,j) 363 h(i+1,j) = -conjg(s)*h(i,j) + c*h(i+1,j) 364 h(i,j) = t 365 50 continue 366c 367c %---------------------------------------------% 368c | Apply rotation to the right of H; H <- H*G | 369c %---------------------------------------------% 370c 371 do 60 j = 1, min(i+2,iend) 372 t = c*h(j,i) + conjg(s)*h(j,i+1) 373 h(j,i+1) = -s*h(j,i) + c*h(j,i+1) 374 h(j,i) = t 375 60 continue 376c 377c %-----------------------------------------------------% 378c | Accumulate the rotation in the matrix Q; Q <- Q*G' | 379c %-----------------------------------------------------% 380c 381 do 70 j = 1, min(i+jj, kplusp) 382 t = c*q(j,i) + conjg(s)*q(j,i+1) 383 q(j,i+1) = - s*q(j,i) + c*q(j,i+1) 384 q(j,i) = t 385 70 continue 386c 387c %---------------------------% 388c | Prepare for next rotation | 389c %---------------------------% 390c 391 if (i .lt. iend-1) then 392 f = h(i+1,i) 393 g = h(i+2,i) 394 end if 395 80 continue 396c 397c %-------------------------------% 398c | Finished applying the shift. | 399c %-------------------------------% 400c 401 100 continue 402c 403c %---------------------------------------------------------% 404c | Apply the same shift to the next block if there is any. | 405c %---------------------------------------------------------% 406c 407 istart = iend + 1 408 if (iend .lt. kplusp) go to 20 409c 410c %---------------------------------------------% 411c | Loop back to the top to get the next shift. | 412c %---------------------------------------------% 413c 414 110 continue 415c 416c %---------------------------------------------------% 417c | Perform a similarity transformation that makes | 418c | sure that the compressed H will have non-negative | 419c | real subdiagonal elements. | 420c %---------------------------------------------------% 421c 422 do 120 j=1,kev 423 if ( dble( h(j+1,j) ) .lt. rzero .or. 424 & dimag( h(j+1,j) ) .ne. rzero ) then 425 t = h(j+1,j) / dlapy2(dble(h(j+1,j)),dimag(h(j+1,j))) 426 call zscal( kplusp-j+1, conjg(t), h(j+1,j), ldh ) 427 call zscal( min(j+2, kplusp), t, h(1,j+1), 1 ) 428 call zscal( min(j+np+1,kplusp), t, q(1,j+1), 1 ) 429 h(j+1,j) = dcmplx( dble( h(j+1,j) ), rzero ) 430 end if 431 120 continue 432c 433 do 130 i = 1, kev 434c 435c %--------------------------------------------% 436c | Final check for splitting and deflation. | 437c | Use a standard test as in the QR algorithm | 438c | REFERENCE: LAPACK subroutine zlahqr. | 439c | Note: Since the subdiagonals of the | 440c | compressed H are nonnegative real numbers, | 441c | we take advantage of this. | 442c %--------------------------------------------% 443c 444 tst1 = cabs1( h( i, i ) ) + cabs1( h( i+1, i+1 ) ) 445 if( tst1 .eq. rzero ) 446 & tst1 = zlanhs( '1', kev, h, ldh, workl ) 447 if( dble( h( i+1,i ) ) .le. max( ulp*tst1, smlnum ) ) 448 & h(i+1,i) = zero 449 130 continue 450c 451c %-------------------------------------------------% 452c | Compute the (kev+1)-st column of (V*Q) and | 453c | temporarily store the result in WORKD(N+1:2*N). | 454c | This is needed in the residual update since we | 455c | cannot GUARANTEE that the corresponding entry | 456c | of H would be zero as in exact arithmetic. | 457c %-------------------------------------------------% 458c 459 if ( dble( h(kev+1,kev) ) .gt. rzero ) 460 & call zgemv ('N', n, kplusp, one, v, ldv, q(1,kev+1), 1, zero, 461 & workd(n+1), 1) 462c 463c %----------------------------------------------------------% 464c | Compute column 1 to kev of (V*Q) in backward order | 465c | taking advantage of the upper Hessenberg structure of Q. | 466c %----------------------------------------------------------% 467c 468 do 140 i = 1, kev 469 call zgemv ('N', n, kplusp-i+1, one, v, ldv, 470 & q(1,kev-i+1), 1, zero, workd, 1) 471 call zcopy (n, workd, 1, v(1,kplusp-i+1), 1) 472 140 continue 473c 474c %-------------------------------------------------% 475c | Move v(:,kplusp-kev+1:kplusp) into v(:,1:kev). | 476c %-------------------------------------------------% 477c 478 call zlacpy ('A', n, kev, v(1,kplusp-kev+1), ldv, v, ldv) 479c 480c %--------------------------------------------------------------% 481c | Copy the (kev+1)-st column of (V*Q) in the appropriate place | 482c %--------------------------------------------------------------% 483c 484 if ( dble( h(kev+1,kev) ) .gt. rzero ) 485 & call zcopy (n, workd(n+1), 1, v(1,kev+1), 1) 486c 487c %-------------------------------------% 488c | Update the residual vector: | 489c | r <- sigmak*r + betak*v(:,kev+1) | 490c | where | 491c | sigmak = (e_{kev+p}'*Q)*e_{kev} | 492c | betak = e_{kev+1}'*H*e_{kev} | 493c %-------------------------------------% 494c 495 call zscal (n, q(kplusp,kev), resid, 1) 496 if ( dble( h(kev+1,kev) ) .gt. rzero ) 497 & call zaxpy (n, h(kev+1,kev), v(1,kev+1), 1, resid, 1) 498c 499 if (msglvl .gt. 1) then 500 call pzvout (comm, logfil, 1, q(kplusp,kev), ndigit, 501 & '_napps: sigmak = (e_{kev+p}^T*Q)*e_{kev}') 502 call pzvout (comm, logfil, 1, h(kev+1,kev), ndigit, 503 & '_napps: betak = e_{kev+1}^T*H*e_{kev}') 504 call pivout (comm, logfil, 1, [kev], ndigit, 505 & '_napps: Order of the final Hessenberg matrix ') 506 if (msglvl .gt. 2) then 507 call pzmout (comm, logfil, kev, kev, h, ldh, ndigit, 508 & '_napps: updated Hessenberg matrix H for next iteration') 509 end if 510c 511 end if 512c 513 9000 continue 514 call arscnd (t1) 515 tcapps = tcapps + (t1 - t0) 516c 517 return 518c 519c %----------------% 520c | End of pznapps | 521c %----------------% 522c 523 end 524