1*> \brief \b CHETRD_HB2ST reduces a complex Hermitian band matrix A to real symmetric tridiagonal form T 2* 3* =========== DOCUMENTATION =========== 4* 5* Online html documentation available at 6* http://www.netlib.org/lapack/explore-html/ 7* 8*> \htmlonly 9*> Download CHETRD_HB2ST + dependencies 10*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chetrd_hb2st.f"> 11*> [TGZ]</a> 12*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chetrd_hb2st.f"> 13*> [ZIP]</a> 14*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chetrd_hb2st.f"> 15*> [TXT]</a> 16*> \endhtmlonly 17* 18* Definition: 19* =========== 20* 21* SUBROUTINE CHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, 22* D, E, HOUS, LHOUS, WORK, LWORK, INFO ) 23* 24* #if defined(_OPENMP) 25* use omp_lib 26* #endif 27* 28* IMPLICIT NONE 29* 30* .. Scalar Arguments .. 31* CHARACTER STAGE1, UPLO, VECT 32* INTEGER N, KD, IB, LDAB, LHOUS, LWORK, INFO 33* .. 34* .. Array Arguments .. 35* REAL D( * ), E( * ) 36* COMPLEX AB( LDAB, * ), HOUS( * ), WORK( * ) 37* .. 38* 39* 40*> \par Purpose: 41* ============= 42*> 43*> \verbatim 44*> 45*> CHETRD_HB2ST reduces a complex Hermitian band matrix A to real symmetric 46*> tridiagonal form T by a unitary similarity transformation: 47*> Q**H * A * Q = T. 48*> \endverbatim 49* 50* Arguments: 51* ========== 52* 53*> \param[in] STAGE1 54*> \verbatim 55*> STAGE1 is CHARACTER*1 56*> = 'N': "No": to mention that the stage 1 of the reduction 57*> from dense to band using the chetrd_he2hb routine 58*> was not called before this routine to reproduce AB. 59*> In other term this routine is called as standalone. 60*> = 'Y': "Yes": to mention that the stage 1 of the 61*> reduction from dense to band using the chetrd_he2hb 62*> routine has been called to produce AB (e.g., AB is 63*> the output of chetrd_he2hb. 64*> \endverbatim 65*> 66*> \param[in] VECT 67*> \verbatim 68*> VECT is CHARACTER*1 69*> = 'N': No need for the Housholder representation, 70*> and thus LHOUS is of size max(1, 4*N); 71*> = 'V': the Householder representation is needed to 72*> either generate or to apply Q later on, 73*> then LHOUS is to be queried and computed. 74*> (NOT AVAILABLE IN THIS RELEASE). 75*> \endverbatim 76*> 77*> \param[in] UPLO 78*> \verbatim 79*> UPLO is CHARACTER*1 80*> = 'U': Upper triangle of A is stored; 81*> = 'L': Lower triangle of A is stored. 82*> \endverbatim 83*> 84*> \param[in] N 85*> \verbatim 86*> N is INTEGER 87*> The order of the matrix A. N >= 0. 88*> \endverbatim 89*> 90*> \param[in] KD 91*> \verbatim 92*> KD is INTEGER 93*> The number of superdiagonals of the matrix A if UPLO = 'U', 94*> or the number of subdiagonals if UPLO = 'L'. KD >= 0. 95*> \endverbatim 96*> 97*> \param[in,out] AB 98*> \verbatim 99*> AB is COMPLEX array, dimension (LDAB,N) 100*> On entry, the upper or lower triangle of the Hermitian band 101*> matrix A, stored in the first KD+1 rows of the array. The 102*> j-th column of A is stored in the j-th column of the array AB 103*> as follows: 104*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j; 105*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd). 106*> On exit, the diagonal elements of AB are overwritten by the 107*> diagonal elements of the tridiagonal matrix T; if KD > 0, the 108*> elements on the first superdiagonal (if UPLO = 'U') or the 109*> first subdiagonal (if UPLO = 'L') are overwritten by the 110*> off-diagonal elements of T; the rest of AB is overwritten by 111*> values generated during the reduction. 112*> \endverbatim 113*> 114*> \param[in] LDAB 115*> \verbatim 116*> LDAB is INTEGER 117*> The leading dimension of the array AB. LDAB >= KD+1. 118*> \endverbatim 119*> 120*> \param[out] D 121*> \verbatim 122*> D is REAL array, dimension (N) 123*> The diagonal elements of the tridiagonal matrix T. 124*> \endverbatim 125*> 126*> \param[out] E 127*> \verbatim 128*> E is REAL array, dimension (N-1) 129*> The off-diagonal elements of the tridiagonal matrix T: 130*> E(i) = T(i,i+1) if UPLO = 'U'; E(i) = T(i+1,i) if UPLO = 'L'. 131*> \endverbatim 132*> 133*> \param[out] HOUS 134*> \verbatim 135*> HOUS is COMPLEX array, dimension LHOUS, that 136*> store the Householder representation. 137*> \endverbatim 138*> 139*> \param[in] LHOUS 140*> \verbatim 141*> LHOUS is INTEGER 142*> The dimension of the array HOUS. LHOUS = MAX(1, dimension) 143*> If LWORK = -1, or LHOUS=-1, 144*> then a query is assumed; the routine 145*> only calculates the optimal size of the HOUS array, returns 146*> this value as the first entry of the HOUS array, and no error 147*> message related to LHOUS is issued by XERBLA. 148*> LHOUS = MAX(1, dimension) where 149*> dimension = 4*N if VECT='N' 150*> not available now if VECT='H' 151*> \endverbatim 152*> 153*> \param[out] WORK 154*> \verbatim 155*> WORK is COMPLEX array, dimension LWORK. 156*> \endverbatim 157*> 158*> \param[in] LWORK 159*> \verbatim 160*> LWORK is INTEGER 161*> The dimension of the array WORK. LWORK = MAX(1, dimension) 162*> If LWORK = -1, or LHOUS=-1, 163*> then a workspace query is assumed; the routine 164*> only calculates the optimal size of the WORK array, returns 165*> this value as the first entry of the WORK array, and no error 166*> message related to LWORK is issued by XERBLA. 167*> LWORK = MAX(1, dimension) where 168*> dimension = (2KD+1)*N + KD*NTHREADS 169*> where KD is the blocking size of the reduction, 170*> FACTOPTNB is the blocking used by the QR or LQ 171*> algorithm, usually FACTOPTNB=128 is a good choice 172*> NTHREADS is the number of threads used when 173*> openMP compilation is enabled, otherwise =1. 174*> \endverbatim 175*> 176*> \param[out] INFO 177*> \verbatim 178*> INFO is INTEGER 179*> = 0: successful exit 180*> < 0: if INFO = -i, the i-th argument had an illegal value 181*> \endverbatim 182* 183* Authors: 184* ======== 185* 186*> \author Univ. of Tennessee 187*> \author Univ. of California Berkeley 188*> \author Univ. of Colorado Denver 189*> \author NAG Ltd. 190* 191*> \ingroup complexOTHERcomputational 192* 193*> \par Further Details: 194* ===================== 195*> 196*> \verbatim 197*> 198*> Implemented by Azzam Haidar. 199*> 200*> All details are available on technical report, SC11, SC13 papers. 201*> 202*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. 203*> Parallel reduction to condensed forms for symmetric eigenvalue problems 204*> using aggregated fine-grained and memory-aware kernels. In Proceedings 205*> of 2011 International Conference for High Performance Computing, 206*> Networking, Storage and Analysis (SC '11), New York, NY, USA, 207*> Article 8 , 11 pages. 208*> http://doi.acm.org/10.1145/2063384.2063394 209*> 210*> A. Haidar, J. Kurzak, P. Luszczek, 2013. 211*> An improved parallel singular value algorithm and its implementation 212*> for multicore hardware, In Proceedings of 2013 International Conference 213*> for High Performance Computing, Networking, Storage and Analysis (SC '13). 214*> Denver, Colorado, USA, 2013. 215*> Article 90, 12 pages. 216*> http://doi.acm.org/10.1145/2503210.2503292 217*> 218*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. 219*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure 220*> calculations based on fine-grained memory aware tasks. 221*> International Journal of High Performance Computing Applications. 222*> Volume 28 Issue 2, Pages 196-209, May 2014. 223*> http://hpc.sagepub.com/content/28/2/196 224*> 225*> \endverbatim 226*> 227* ===================================================================== 228 SUBROUTINE CHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, 229 $ D, E, HOUS, LHOUS, WORK, LWORK, INFO ) 230* 231* 232#if defined(_OPENMP) 233 use omp_lib 234#endif 235* 236 IMPLICIT NONE 237* 238* -- LAPACK computational routine -- 239* -- LAPACK is a software package provided by Univ. of Tennessee, -- 240* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- 241* 242* .. Scalar Arguments .. 243 CHARACTER STAGE1, UPLO, VECT 244 INTEGER N, KD, LDAB, LHOUS, LWORK, INFO 245* .. 246* .. Array Arguments .. 247 REAL D( * ), E( * ) 248 COMPLEX AB( LDAB, * ), HOUS( * ), WORK( * ) 249* .. 250* 251* ===================================================================== 252* 253* .. Parameters .. 254 REAL RZERO 255 COMPLEX ZERO, ONE 256 PARAMETER ( RZERO = 0.0E+0, 257 $ ZERO = ( 0.0E+0, 0.0E+0 ), 258 $ ONE = ( 1.0E+0, 0.0E+0 ) ) 259* .. 260* .. Local Scalars .. 261 LOGICAL LQUERY, WANTQ, UPPER, AFTERS1 262 INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, 263 $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED, 264 $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID, 265 $ NBTILES, TTYPE, TID, NTHREADS, DEBUG, 266 $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, 267 $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU, 268 $ SICEV, SIZETAU, LDV, LHMIN, LWMIN 269 REAL ABSTMP 270 COMPLEX TMP 271* .. 272* .. External Subroutines .. 273 EXTERNAL CHB2ST_KERNELS, CLACPY, CLASET, XERBLA 274* .. 275* .. Intrinsic Functions .. 276 INTRINSIC MIN, MAX, CEILING, REAL 277* .. 278* .. External Functions .. 279 LOGICAL LSAME 280 INTEGER ILAENV2STAGE 281 EXTERNAL LSAME, ILAENV2STAGE 282* .. 283* .. Executable Statements .. 284* 285* Determine the minimal workspace size required. 286* Test the input parameters 287* 288 DEBUG = 0 289 INFO = 0 290 AFTERS1 = LSAME( STAGE1, 'Y' ) 291 WANTQ = LSAME( VECT, 'V' ) 292 UPPER = LSAME( UPLO, 'U' ) 293 LQUERY = ( LWORK.EQ.-1 ) .OR. ( LHOUS.EQ.-1 ) 294* 295* Determine the block size, the workspace size and the hous size. 296* 297 IB = ILAENV2STAGE( 2, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 ) 298 LHMIN = ILAENV2STAGE( 3, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) 299 LWMIN = ILAENV2STAGE( 4, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) 300* 301 IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN 302 INFO = -1 303 ELSE IF( .NOT.LSAME( VECT, 'N' ) ) THEN 304 INFO = -2 305 ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN 306 INFO = -3 307 ELSE IF( N.LT.0 ) THEN 308 INFO = -4 309 ELSE IF( KD.LT.0 ) THEN 310 INFO = -5 311 ELSE IF( LDAB.LT.(KD+1) ) THEN 312 INFO = -7 313 ELSE IF( LHOUS.LT.LHMIN .AND. .NOT.LQUERY ) THEN 314 INFO = -11 315 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN 316 INFO = -13 317 END IF 318* 319 IF( INFO.EQ.0 ) THEN 320 HOUS( 1 ) = LHMIN 321 WORK( 1 ) = LWMIN 322 END IF 323* 324 IF( INFO.NE.0 ) THEN 325 CALL XERBLA( 'CHETRD_HB2ST', -INFO ) 326 RETURN 327 ELSE IF( LQUERY ) THEN 328 RETURN 329 END IF 330* 331* Quick return if possible 332* 333 IF( N.EQ.0 ) THEN 334 HOUS( 1 ) = 1 335 WORK( 1 ) = 1 336 RETURN 337 END IF 338* 339* Determine pointer position 340* 341 LDV = KD + IB 342 SIZETAU = 2 * N 343 SICEV = 2 * N 344 INDTAU = 1 345 INDV = INDTAU + SIZETAU 346 LDA = 2 * KD + 1 347 SIZEA = LDA * N 348 INDA = 1 349 INDW = INDA + SIZEA 350 NTHREADS = 1 351 TID = 0 352* 353 IF( UPPER ) THEN 354 APOS = INDA + KD 355 AWPOS = INDA 356 DPOS = APOS + KD 357 OFDPOS = DPOS - 1 358 ABDPOS = KD + 1 359 ABOFDPOS = KD 360 ELSE 361 APOS = INDA 362 AWPOS = INDA + KD + 1 363 DPOS = APOS 364 OFDPOS = DPOS + 1 365 ABDPOS = 1 366 ABOFDPOS = 2 367 368 ENDIF 369* 370* Case KD=0: 371* The matrix is diagonal. We just copy it (convert to "real" for 372* complex because D is double and the imaginary part should be 0) 373* and store it in D. A sequential code here is better or 374* in a parallel environment it might need two cores for D and E 375* 376 IF( KD.EQ.0 ) THEN 377 DO 30 I = 1, N 378 D( I ) = REAL( AB( ABDPOS, I ) ) 379 30 CONTINUE 380 DO 40 I = 1, N-1 381 E( I ) = RZERO 382 40 CONTINUE 383* 384 HOUS( 1 ) = 1 385 WORK( 1 ) = 1 386 RETURN 387 END IF 388* 389* Case KD=1: 390* The matrix is already Tridiagonal. We have to make diagonal 391* and offdiagonal elements real, and store them in D and E. 392* For that, for real precision just copy the diag and offdiag 393* to D and E while for the COMPLEX case the bulge chasing is 394* performed to convert the hermetian tridiagonal to symmetric 395* tridiagonal. A simpler conversion formula might be used, but then 396* updating the Q matrix will be required and based if Q is generated 397* or not this might complicate the story. 398* 399 IF( KD.EQ.1 ) THEN 400 DO 50 I = 1, N 401 D( I ) = REAL( AB( ABDPOS, I ) ) 402 50 CONTINUE 403* 404* make off-diagonal elements real and copy them to E 405* 406 IF( UPPER ) THEN 407 DO 60 I = 1, N - 1 408 TMP = AB( ABOFDPOS, I+1 ) 409 ABSTMP = ABS( TMP ) 410 AB( ABOFDPOS, I+1 ) = ABSTMP 411 E( I ) = ABSTMP 412 IF( ABSTMP.NE.RZERO ) THEN 413 TMP = TMP / ABSTMP 414 ELSE 415 TMP = ONE 416 END IF 417 IF( I.LT.N-1 ) 418 $ AB( ABOFDPOS, I+2 ) = AB( ABOFDPOS, I+2 )*TMP 419C IF( WANTZ ) THEN 420C CALL CSCAL( N, CONJG( TMP ), Q( 1, I+1 ), 1 ) 421C END IF 422 60 CONTINUE 423 ELSE 424 DO 70 I = 1, N - 1 425 TMP = AB( ABOFDPOS, I ) 426 ABSTMP = ABS( TMP ) 427 AB( ABOFDPOS, I ) = ABSTMP 428 E( I ) = ABSTMP 429 IF( ABSTMP.NE.RZERO ) THEN 430 TMP = TMP / ABSTMP 431 ELSE 432 TMP = ONE 433 END IF 434 IF( I.LT.N-1 ) 435 $ AB( ABOFDPOS, I+1 ) = AB( ABOFDPOS, I+1 )*TMP 436C IF( WANTQ ) THEN 437C CALL CSCAL( N, TMP, Q( 1, I+1 ), 1 ) 438C END IF 439 70 CONTINUE 440 ENDIF 441* 442 HOUS( 1 ) = 1 443 WORK( 1 ) = 1 444 RETURN 445 END IF 446* 447* Main code start here. 448* Reduce the hermitian band of A to a tridiagonal matrix. 449* 450 THGRSIZ = N 451 GRSIZ = 1 452 SHIFT = 3 453 NBTILES = CEILING( REAL(N)/REAL(KD) ) 454 STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) ) 455 THGRNB = CEILING( REAL(N-1)/REAL(THGRSIZ) ) 456* 457 CALL CLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA ) 458 CALL CLASET( "A", KD, N, ZERO, ZERO, WORK( AWPOS ), LDA ) 459* 460* 461* openMP parallelisation start here 462* 463#if defined(_OPENMP) 464!$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND ) 465!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) 466!$OMP$ PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND ) 467!$OMP$ SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK) 468!$OMP$ SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA ) 469!$OMP$ SHARED ( STEPERCOL, THGRNB, THGRSIZ, GRSIZ, SHIFT ) 470!$OMP MASTER 471#endif 472* 473* main bulge chasing loop 474* 475 DO 100 THGRID = 1, THGRNB 476 STT = (THGRID-1)*THGRSIZ+1 477 THED = MIN( (STT + THGRSIZ -1), (N-1)) 478 DO 110 I = STT, N-1 479 ED = MIN( I, THED ) 480 IF( STT.GT.ED ) EXIT 481 DO 120 M = 1, STEPERCOL 482 ST = STT 483 DO 130 SWEEPID = ST, ED 484 DO 140 K = 1, GRSIZ 485 MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) 486 $ + (M-1)*GRSIZ + K 487 IF ( MYID.EQ.1 ) THEN 488 TTYPE = 1 489 ELSE 490 TTYPE = MOD( MYID, 2 ) + 2 491 ENDIF 492 493 IF( TTYPE.EQ.2 ) THEN 494 COLPT = (MYID/2)*KD + SWEEPID 495 STIND = COLPT-KD+1 496 EDIND = MIN(COLPT,N) 497 BLKLASTIND = COLPT 498 ELSE 499 COLPT = ((MYID+1)/2)*KD + SWEEPID 500 STIND = COLPT-KD+1 501 EDIND = MIN(COLPT,N) 502 IF( ( STIND.GE.EDIND-1 ).AND. 503 $ ( EDIND.EQ.N ) ) THEN 504 BLKLASTIND = N 505 ELSE 506 BLKLASTIND = 0 507 ENDIF 508 ENDIF 509* 510* Call the kernel 511* 512#if defined(_OPENMP) && _OPENMP >= 201307 513 IF( TTYPE.NE.1 ) THEN 514!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) 515!$OMP$ DEPEND(in:WORK(MYID-1)) 516!$OMP$ DEPEND(out:WORK(MYID)) 517 TID = OMP_GET_THREAD_NUM() 518 CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE, 519 $ STIND, EDIND, SWEEPID, N, KD, IB, 520 $ WORK ( INDA ), LDA, 521 $ HOUS( INDV ), HOUS( INDTAU ), LDV, 522 $ WORK( INDW + TID*KD ) ) 523!$OMP END TASK 524 ELSE 525!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) 526!$OMP$ DEPEND(out:WORK(MYID)) 527 TID = OMP_GET_THREAD_NUM() 528 CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE, 529 $ STIND, EDIND, SWEEPID, N, KD, IB, 530 $ WORK ( INDA ), LDA, 531 $ HOUS( INDV ), HOUS( INDTAU ), LDV, 532 $ WORK( INDW + TID*KD ) ) 533!$OMP END TASK 534 ENDIF 535#else 536 CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE, 537 $ STIND, EDIND, SWEEPID, N, KD, IB, 538 $ WORK ( INDA ), LDA, 539 $ HOUS( INDV ), HOUS( INDTAU ), LDV, 540 $ WORK( INDW + TID*KD ) ) 541#endif 542 IF ( BLKLASTIND.GE.(N-1) ) THEN 543 STT = STT + 1 544 EXIT 545 ENDIF 546 140 CONTINUE 547 130 CONTINUE 548 120 CONTINUE 549 110 CONTINUE 550 100 CONTINUE 551* 552#if defined(_OPENMP) 553!$OMP END MASTER 554!$OMP END PARALLEL 555#endif 556* 557* Copy the diagonal from A to D. Note that D is REAL thus only 558* the Real part is needed, the imaginary part should be zero. 559* 560 DO 150 I = 1, N 561 D( I ) = REAL( WORK( DPOS+(I-1)*LDA ) ) 562 150 CONTINUE 563* 564* Copy the off diagonal from A to E. Note that E is REAL thus only 565* the Real part is needed, the imaginary part should be zero. 566* 567 IF( UPPER ) THEN 568 DO 160 I = 1, N-1 569 E( I ) = REAL( WORK( OFDPOS+I*LDA ) ) 570 160 CONTINUE 571 ELSE 572 DO 170 I = 1, N-1 573 E( I ) = REAL( WORK( OFDPOS+(I-1)*LDA ) ) 574 170 CONTINUE 575 ENDIF 576* 577 HOUS( 1 ) = LHMIN 578 WORK( 1 ) = LWMIN 579 RETURN 580* 581* End of CHETRD_HB2ST 582* 583 END 584 585