1*> \brief \b CHETRD_HB2ST reduces a complex Hermitian band matrix A to real symmetric tridiagonal form T
2*
3*  =========== DOCUMENTATION ===========
4*
5* Online html documentation available at
6*            http://www.netlib.org/lapack/explore-html/
7*
8*> \htmlonly
9*> Download CHETRD_HB2ST + dependencies
10*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chetrd_hb2st.f">
11*> [TGZ]</a>
12*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chetrd_hb2st.f">
13*> [ZIP]</a>
14*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chetrd_hb2st.f">
15*> [TXT]</a>
16*> \endhtmlonly
17*
18*  Definition:
19*  ===========
20*
21*       SUBROUTINE CHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
22*                               D, E, HOUS, LHOUS, WORK, LWORK, INFO )
23*
24*       #if defined(_OPENMP)
25*       use omp_lib
26*       #endif
27*
28*       IMPLICIT NONE
29*
30*       .. Scalar Arguments ..
31*       CHARACTER          STAGE1, UPLO, VECT
32*       INTEGER            N, KD, IB, LDAB, LHOUS, LWORK, INFO
33*       ..
34*       .. Array Arguments ..
35*       REAL               D( * ), E( * )
36*       COMPLEX            AB( LDAB, * ), HOUS( * ), WORK( * )
37*       ..
38*
39*
40*> \par Purpose:
41*  =============
42*>
43*> \verbatim
44*>
45*> CHETRD_HB2ST reduces a complex Hermitian band matrix A to real symmetric
46*> tridiagonal form T by a unitary similarity transformation:
47*> Q**H * A * Q = T.
48*> \endverbatim
49*
50*  Arguments:
51*  ==========
52*
53*> \param[in] STAGE1
54*> \verbatim
55*>          STAGE1 is CHARACTER*1
56*>          = 'N':  "No": to mention that the stage 1 of the reduction
57*>                  from dense to band using the chetrd_he2hb routine
58*>                  was not called before this routine to reproduce AB.
59*>                  In other term this routine is called as standalone.
60*>          = 'Y':  "Yes": to mention that the stage 1 of the
61*>                  reduction from dense to band using the chetrd_he2hb
62*>                  routine has been called to produce AB (e.g., AB is
63*>                  the output of chetrd_he2hb.
64*> \endverbatim
65*>
66*> \param[in] VECT
67*> \verbatim
68*>          VECT is CHARACTER*1
69*>          = 'N':  No need for the Housholder representation,
70*>                  and thus LHOUS is of size max(1, 4*N);
71*>          = 'V':  the Householder representation is needed to
72*>                  either generate or to apply Q later on,
73*>                  then LHOUS is to be queried and computed.
74*>                  (NOT AVAILABLE IN THIS RELEASE).
75*> \endverbatim
76*>
77*> \param[in] UPLO
78*> \verbatim
79*>          UPLO is CHARACTER*1
80*>          = 'U':  Upper triangle of A is stored;
81*>          = 'L':  Lower triangle of A is stored.
82*> \endverbatim
83*>
84*> \param[in] N
85*> \verbatim
86*>          N is INTEGER
87*>          The order of the matrix A.  N >= 0.
88*> \endverbatim
89*>
90*> \param[in] KD
91*> \verbatim
92*>          KD is INTEGER
93*>          The number of superdiagonals of the matrix A if UPLO = 'U',
94*>          or the number of subdiagonals if UPLO = 'L'.  KD >= 0.
95*> \endverbatim
96*>
97*> \param[in,out] AB
98*> \verbatim
99*>          AB is COMPLEX array, dimension (LDAB,N)
100*>          On entry, the upper or lower triangle of the Hermitian band
101*>          matrix A, stored in the first KD+1 rows of the array.  The
102*>          j-th column of A is stored in the j-th column of the array AB
103*>          as follows:
104*>          if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
105*>          if UPLO = 'L', AB(1+i-j,j)    = A(i,j) for j<=i<=min(n,j+kd).
106*>          On exit, the diagonal elements of AB are overwritten by the
107*>          diagonal elements of the tridiagonal matrix T; if KD > 0, the
108*>          elements on the first superdiagonal (if UPLO = 'U') or the
109*>          first subdiagonal (if UPLO = 'L') are overwritten by the
110*>          off-diagonal elements of T; the rest of AB is overwritten by
111*>          values generated during the reduction.
112*> \endverbatim
113*>
114*> \param[in] LDAB
115*> \verbatim
116*>          LDAB is INTEGER
117*>          The leading dimension of the array AB.  LDAB >= KD+1.
118*> \endverbatim
119*>
120*> \param[out] D
121*> \verbatim
122*>          D is REAL array, dimension (N)
123*>          The diagonal elements of the tridiagonal matrix T.
124*> \endverbatim
125*>
126*> \param[out] E
127*> \verbatim
128*>          E is REAL array, dimension (N-1)
129*>          The off-diagonal elements of the tridiagonal matrix T:
130*>          E(i) = T(i,i+1) if UPLO = 'U'; E(i) = T(i+1,i) if UPLO = 'L'.
131*> \endverbatim
132*>
133*> \param[out] HOUS
134*> \verbatim
135*>          HOUS is COMPLEX array, dimension LHOUS, that
136*>          store the Householder representation.
137*> \endverbatim
138*>
139*> \param[in] LHOUS
140*> \verbatim
141*>          LHOUS is INTEGER
142*>          The dimension of the array HOUS. LHOUS = MAX(1, dimension)
143*>          If LWORK = -1, or LHOUS=-1,
144*>          then a query is assumed; the routine
145*>          only calculates the optimal size of the HOUS array, returns
146*>          this value as the first entry of the HOUS array, and no error
147*>          message related to LHOUS is issued by XERBLA.
148*>          LHOUS = MAX(1, dimension) where
149*>          dimension = 4*N if VECT='N'
150*>          not available now if VECT='H'
151*> \endverbatim
152*>
153*> \param[out] WORK
154*> \verbatim
155*>          WORK is COMPLEX array, dimension LWORK.
156*> \endverbatim
157*>
158*> \param[in] LWORK
159*> \verbatim
160*>          LWORK is INTEGER
161*>          The dimension of the array WORK. LWORK = MAX(1, dimension)
162*>          If LWORK = -1, or LHOUS=-1,
163*>          then a workspace query is assumed; the routine
164*>          only calculates the optimal size of the WORK array, returns
165*>          this value as the first entry of the WORK array, and no error
166*>          message related to LWORK is issued by XERBLA.
167*>          LWORK = MAX(1, dimension) where
168*>          dimension   = (2KD+1)*N + KD*NTHREADS
169*>          where KD is the blocking size of the reduction,
170*>          FACTOPTNB is the blocking used by the QR or LQ
171*>          algorithm, usually FACTOPTNB=128 is a good choice
172*>          NTHREADS is the number of threads used when
173*>          openMP compilation is enabled, otherwise =1.
174*> \endverbatim
175*>
176*> \param[out] INFO
177*> \verbatim
178*>          INFO is INTEGER
179*>          = 0:  successful exit
180*>          < 0:  if INFO = -i, the i-th argument had an illegal value
181*> \endverbatim
182*
183*  Authors:
184*  ========
185*
186*> \author Univ. of Tennessee
187*> \author Univ. of California Berkeley
188*> \author Univ. of Colorado Denver
189*> \author NAG Ltd.
190*
191*> \ingroup complexOTHERcomputational
192*
193*> \par Further Details:
194*  =====================
195*>
196*> \verbatim
197*>
198*>  Implemented by Azzam Haidar.
199*>
200*>  All details are available on technical report, SC11, SC13 papers.
201*>
202*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
203*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
204*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
205*>  of 2011 International Conference for High Performance Computing,
206*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
207*>  Article 8 , 11 pages.
208*>  http://doi.acm.org/10.1145/2063384.2063394
209*>
210*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
211*>  An improved parallel singular value algorithm and its implementation
212*>  for multicore hardware, In Proceedings of 2013 International Conference
213*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
214*>  Denver, Colorado, USA, 2013.
215*>  Article 90, 12 pages.
216*>  http://doi.acm.org/10.1145/2503210.2503292
217*>
218*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
219*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
220*>  calculations based on fine-grained memory aware tasks.
221*>  International Journal of High Performance Computing Applications.
222*>  Volume 28 Issue 2, Pages 196-209, May 2014.
223*>  http://hpc.sagepub.com/content/28/2/196
224*>
225*> \endverbatim
226*>
227*  =====================================================================
228      SUBROUTINE CHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
229     $                         D, E, HOUS, LHOUS, WORK, LWORK, INFO )
230*
231*
232#if defined(_OPENMP)
233      use omp_lib
234#endif
235*
236      IMPLICIT NONE
237*
238*  -- LAPACK computational routine --
239*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
240*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
241*
242*     .. Scalar Arguments ..
243      CHARACTER          STAGE1, UPLO, VECT
244      INTEGER            N, KD, LDAB, LHOUS, LWORK, INFO
245*     ..
246*     .. Array Arguments ..
247      REAL               D( * ), E( * )
248      COMPLEX            AB( LDAB, * ), HOUS( * ), WORK( * )
249*     ..
250*
251*  =====================================================================
252*
253*     .. Parameters ..
254      REAL               RZERO
255      COMPLEX            ZERO, ONE
256      PARAMETER          ( RZERO = 0.0E+0,
257     $                   ZERO = ( 0.0E+0, 0.0E+0 ),
258     $                   ONE  = ( 1.0E+0, 0.0E+0 ) )
259*     ..
260*     .. Local Scalars ..
261      LOGICAL            LQUERY, WANTQ, UPPER, AFTERS1
262      INTEGER            I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST,
263     $                   ED, STIND, EDIND, BLKLASTIND, COLPT, THED,
264     $                   STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID,
265     $                   NBTILES, TTYPE, TID, NTHREADS, DEBUG,
266     $                   ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS,
267     $                   INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU,
268     $                   SICEV, SIZETAU, LDV, LHMIN, LWMIN
269      REAL               ABSTMP
270      COMPLEX            TMP
271*     ..
272*     .. External Subroutines ..
273      EXTERNAL           CHB2ST_KERNELS, CLACPY, CLASET, XERBLA
274*     ..
275*     .. Intrinsic Functions ..
276      INTRINSIC          MIN, MAX, CEILING, REAL
277*     ..
278*     .. External Functions ..
279      LOGICAL            LSAME
280      INTEGER            ILAENV2STAGE
281      EXTERNAL           LSAME, ILAENV2STAGE
282*     ..
283*     .. Executable Statements ..
284*
285*     Determine the minimal workspace size required.
286*     Test the input parameters
287*
288      DEBUG   = 0
289      INFO    = 0
290      AFTERS1 = LSAME( STAGE1, 'Y' )
291      WANTQ   = LSAME( VECT, 'V' )
292      UPPER   = LSAME( UPLO, 'U' )
293      LQUERY  = ( LWORK.EQ.-1 ) .OR. ( LHOUS.EQ.-1 )
294*
295*     Determine the block size, the workspace size and the hous size.
296*
297      IB     = ILAENV2STAGE( 2, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 )
298      LHMIN  = ILAENV2STAGE( 3, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 )
299      LWMIN  = ILAENV2STAGE( 4, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 )
300*
301      IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
302         INFO = -1
303      ELSE IF( .NOT.LSAME( VECT, 'N' ) ) THEN
304         INFO = -2
305      ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
306         INFO = -3
307      ELSE IF( N.LT.0 ) THEN
308         INFO = -4
309      ELSE IF( KD.LT.0 ) THEN
310         INFO = -5
311      ELSE IF( LDAB.LT.(KD+1) ) THEN
312         INFO = -7
313      ELSE IF( LHOUS.LT.LHMIN .AND. .NOT.LQUERY ) THEN
314         INFO = -11
315      ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
316         INFO = -13
317      END IF
318*
319      IF( INFO.EQ.0 ) THEN
320         HOUS( 1 ) = LHMIN
321         WORK( 1 ) = LWMIN
322      END IF
323*
324      IF( INFO.NE.0 ) THEN
325         CALL XERBLA( 'CHETRD_HB2ST', -INFO )
326         RETURN
327      ELSE IF( LQUERY ) THEN
328         RETURN
329      END IF
330*
331*     Quick return if possible
332*
333      IF( N.EQ.0 ) THEN
334          HOUS( 1 ) = 1
335          WORK( 1 ) = 1
336          RETURN
337      END IF
338*
339*     Determine pointer position
340*
341      LDV      = KD + IB
342      SIZETAU  = 2 * N
343      SICEV    = 2 * N
344      INDTAU   = 1
345      INDV     = INDTAU + SIZETAU
346      LDA      = 2 * KD + 1
347      SIZEA    = LDA * N
348      INDA     = 1
349      INDW     = INDA + SIZEA
350      NTHREADS = 1
351      TID      = 0
352*
353      IF( UPPER ) THEN
354          APOS     = INDA + KD
355          AWPOS    = INDA
356          DPOS     = APOS + KD
357          OFDPOS   = DPOS - 1
358          ABDPOS   = KD + 1
359          ABOFDPOS = KD
360      ELSE
361          APOS     = INDA
362          AWPOS    = INDA + KD + 1
363          DPOS     = APOS
364          OFDPOS   = DPOS + 1
365          ABDPOS   = 1
366          ABOFDPOS = 2
367
368      ENDIF
369*
370*     Case KD=0:
371*     The matrix is diagonal. We just copy it (convert to "real" for
372*     complex because D is double and the imaginary part should be 0)
373*     and store it in D. A sequential code here is better or
374*     in a parallel environment it might need two cores for D and E
375*
376      IF( KD.EQ.0 ) THEN
377          DO 30 I = 1, N
378              D( I ) = REAL( AB( ABDPOS, I ) )
379   30     CONTINUE
380          DO 40 I = 1, N-1
381              E( I ) = RZERO
382   40     CONTINUE
383*
384          HOUS( 1 ) = 1
385          WORK( 1 ) = 1
386          RETURN
387      END IF
388*
389*     Case KD=1:
390*     The matrix is already Tridiagonal. We have to make diagonal
391*     and offdiagonal elements real, and store them in D and E.
392*     For that, for real precision just copy the diag and offdiag
393*     to D and E while for the COMPLEX case the bulge chasing is
394*     performed to convert the hermetian tridiagonal to symmetric
395*     tridiagonal. A simpler conversion formula might be used, but then
396*     updating the Q matrix will be required and based if Q is generated
397*     or not this might complicate the story.
398*
399      IF( KD.EQ.1 ) THEN
400          DO 50 I = 1, N
401              D( I ) = REAL( AB( ABDPOS, I ) )
402   50     CONTINUE
403*
404*         make off-diagonal elements real and copy them to E
405*
406          IF( UPPER ) THEN
407              DO 60 I = 1, N - 1
408                  TMP = AB( ABOFDPOS, I+1 )
409                  ABSTMP = ABS( TMP )
410                  AB( ABOFDPOS, I+1 ) = ABSTMP
411                  E( I ) = ABSTMP
412                  IF( ABSTMP.NE.RZERO ) THEN
413                     TMP = TMP / ABSTMP
414                  ELSE
415                     TMP = ONE
416                  END IF
417                  IF( I.LT.N-1 )
418     $               AB( ABOFDPOS, I+2 ) = AB( ABOFDPOS, I+2 )*TMP
419C                  IF( WANTZ ) THEN
420C                     CALL CSCAL( N, CONJG( TMP ), Q( 1, I+1 ), 1 )
421C                  END IF
422   60         CONTINUE
423          ELSE
424              DO 70 I = 1, N - 1
425                 TMP = AB( ABOFDPOS, I )
426                 ABSTMP = ABS( TMP )
427                 AB( ABOFDPOS, I ) = ABSTMP
428                 E( I ) = ABSTMP
429                 IF( ABSTMP.NE.RZERO ) THEN
430                    TMP = TMP / ABSTMP
431                 ELSE
432                    TMP = ONE
433                 END IF
434                 IF( I.LT.N-1 )
435     $              AB( ABOFDPOS, I+1 ) = AB( ABOFDPOS, I+1 )*TMP
436C                 IF( WANTQ ) THEN
437C                    CALL CSCAL( N, TMP, Q( 1, I+1 ), 1 )
438C                 END IF
439   70         CONTINUE
440          ENDIF
441*
442          HOUS( 1 ) = 1
443          WORK( 1 ) = 1
444          RETURN
445      END IF
446*
447*     Main code start here.
448*     Reduce the hermitian band of A to a tridiagonal matrix.
449*
450      THGRSIZ   = N
451      GRSIZ     = 1
452      SHIFT     = 3
453      NBTILES   = CEILING( REAL(N)/REAL(KD) )
454      STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) )
455      THGRNB    = CEILING( REAL(N-1)/REAL(THGRSIZ) )
456*
457      CALL CLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA )
458      CALL CLASET( "A", KD,   N, ZERO, ZERO, WORK( AWPOS ), LDA )
459*
460*
461*     openMP parallelisation start here
462*
463#if defined(_OPENMP)
464!$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND )
465!$OMP$         PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID )
466!$OMP$         PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND )
467!$OMP$         SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK)
468!$OMP$         SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA )
469!$OMP$         SHARED ( STEPERCOL, THGRNB, THGRSIZ, GRSIZ, SHIFT )
470!$OMP MASTER
471#endif
472*
473*     main bulge chasing loop
474*
475      DO 100 THGRID = 1, THGRNB
476          STT  = (THGRID-1)*THGRSIZ+1
477          THED = MIN( (STT + THGRSIZ -1), (N-1))
478          DO 110 I = STT, N-1
479              ED = MIN( I, THED )
480              IF( STT.GT.ED ) EXIT
481              DO 120 M = 1, STEPERCOL
482                  ST = STT
483                  DO 130 SWEEPID = ST, ED
484                      DO 140 K = 1, GRSIZ
485                          MYID  = (I-SWEEPID)*(STEPERCOL*GRSIZ)
486     $                           + (M-1)*GRSIZ + K
487                          IF ( MYID.EQ.1 ) THEN
488                              TTYPE = 1
489                          ELSE
490                              TTYPE = MOD( MYID, 2 ) + 2
491                          ENDIF
492
493                          IF( TTYPE.EQ.2 ) THEN
494                              COLPT      = (MYID/2)*KD + SWEEPID
495                              STIND      = COLPT-KD+1
496                              EDIND      = MIN(COLPT,N)
497                              BLKLASTIND = COLPT
498                          ELSE
499                              COLPT      = ((MYID+1)/2)*KD + SWEEPID
500                              STIND      = COLPT-KD+1
501                              EDIND      = MIN(COLPT,N)
502                              IF( ( STIND.GE.EDIND-1 ).AND.
503     $                            ( EDIND.EQ.N ) ) THEN
504                                  BLKLASTIND = N
505                              ELSE
506                                  BLKLASTIND = 0
507                              ENDIF
508                          ENDIF
509*
510*                         Call the kernel
511*
512#if defined(_OPENMP) && _OPENMP >= 201307
513                          IF( TTYPE.NE.1 ) THEN
514!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
515!$OMP$     DEPEND(in:WORK(MYID-1))
516!$OMP$     DEPEND(out:WORK(MYID))
517                              TID      = OMP_GET_THREAD_NUM()
518                              CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE,
519     $                             STIND, EDIND, SWEEPID, N, KD, IB,
520     $                             WORK ( INDA ), LDA,
521     $                             HOUS( INDV ), HOUS( INDTAU ), LDV,
522     $                             WORK( INDW + TID*KD ) )
523!$OMP END TASK
524                          ELSE
525!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
526!$OMP$     DEPEND(out:WORK(MYID))
527                              TID      = OMP_GET_THREAD_NUM()
528                              CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE,
529     $                             STIND, EDIND, SWEEPID, N, KD, IB,
530     $                             WORK ( INDA ), LDA,
531     $                             HOUS( INDV ), HOUS( INDTAU ), LDV,
532     $                             WORK( INDW + TID*KD ) )
533!$OMP END TASK
534                          ENDIF
535#else
536                          CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE,
537     $                         STIND, EDIND, SWEEPID, N, KD, IB,
538     $                         WORK ( INDA ), LDA,
539     $                         HOUS( INDV ), HOUS( INDTAU ), LDV,
540     $                         WORK( INDW + TID*KD ) )
541#endif
542                          IF ( BLKLASTIND.GE.(N-1) ) THEN
543                              STT = STT + 1
544                              EXIT
545                          ENDIF
546  140                 CONTINUE
547  130             CONTINUE
548  120         CONTINUE
549  110     CONTINUE
550  100 CONTINUE
551*
552#if defined(_OPENMP)
553!$OMP END MASTER
554!$OMP END PARALLEL
555#endif
556*
557*     Copy the diagonal from A to D. Note that D is REAL thus only
558*     the Real part is needed, the imaginary part should be zero.
559*
560      DO 150 I = 1, N
561          D( I ) = REAL( WORK( DPOS+(I-1)*LDA ) )
562  150 CONTINUE
563*
564*     Copy the off diagonal from A to E. Note that E is REAL thus only
565*     the Real part is needed, the imaginary part should be zero.
566*
567      IF( UPPER ) THEN
568          DO 160 I = 1, N-1
569             E( I ) = REAL( WORK( OFDPOS+I*LDA ) )
570  160     CONTINUE
571      ELSE
572          DO 170 I = 1, N-1
573             E( I ) = REAL( WORK( OFDPOS+(I-1)*LDA ) )
574  170     CONTINUE
575      ENDIF
576*
577      HOUS( 1 ) = LHMIN
578      WORK( 1 ) = LWMIN
579      RETURN
580*
581*     End of CHETRD_HB2ST
582*
583      END
584
585