1*> \brief \b SSYTRD_SB2ST reduces a real symmetric band matrix A to real symmetric tridiagonal form T
2*
3*  =========== DOCUMENTATION ===========
4*
5* Online html documentation available at
6*            http://www.netlib.org/lapack/explore-html/
7*
8*> \htmlonly
9*> Download SSYTRD_SB2ST + dependencies
10*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssytrd_sb2t.f">
11*> [TGZ]</a>
12*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssytrd_sb2t.f">
13*> [ZIP]</a>
14*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssytrd_sb2t.f">
15*> [TXT]</a>
16*> \endhtmlonly
17*
18*  Definition:
19*  ===========
20*
21*       SUBROUTINE SSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
22*                               D, E, HOUS, LHOUS, WORK, LWORK, INFO )
23*
24*       #if defined(_OPENMP)
25*       use omp_lib
26*       #endif
27*
28*       IMPLICIT NONE
29*
30*       .. Scalar Arguments ..
31*       CHARACTER          STAGE1, UPLO, VECT
32*       INTEGER            N, KD, IB, LDAB, LHOUS, LWORK, INFO
33*       ..
34*       .. Array Arguments ..
35*       REAL               D( * ), E( * )
36*       REAL               AB( LDAB, * ), HOUS( * ), WORK( * )
37*       ..
38*
39*
40*> \par Purpose:
41*  =============
42*>
43*> \verbatim
44*>
45*> SSYTRD_SB2ST reduces a real symmetric band matrix A to real symmetric
46*> tridiagonal form T by a orthogonal similarity transformation:
47*> Q**T * A * Q = T.
48*> \endverbatim
49*
50*  Arguments:
51*  ==========
52*
53*> \param[in] STAGE1
54*> \verbatim
55*>          STAGE1 is CHARACTER*1
56*>          = 'N':  "No": to mention that the stage 1 of the reduction
57*>                  from dense to band using the ssytrd_sy2sb routine
58*>                  was not called before this routine to reproduce AB.
59*>                  In other term this routine is called as standalone.
60*>          = 'Y':  "Yes": to mention that the stage 1 of the
61*>                  reduction from dense to band using the ssytrd_sy2sb
62*>                  routine has been called to produce AB (e.g., AB is
63*>                  the output of ssytrd_sy2sb.
64*> \endverbatim
65*>
66*> \param[in] VECT
67*> \verbatim
68*>          VECT is CHARACTER*1
69*>          = 'N':  No need for the Housholder representation,
70*>                  and thus LHOUS is of size max(1, 4*N);
71*>          = 'V':  the Householder representation is needed to
72*>                  either generate or to apply Q later on,
73*>                  then LHOUS is to be queried and computed.
74*>                  (NOT AVAILABLE IN THIS RELEASE).
75*> \endverbatim
76*>
77*> \param[in] UPLO
78*> \verbatim
79*>          UPLO is CHARACTER*1
80*>          = 'U':  Upper triangle of A is stored;
81*>          = 'L':  Lower triangle of A is stored.
82*> \endverbatim
83*>
84*> \param[in] N
85*> \verbatim
86*>          N is INTEGER
87*>          The order of the matrix A.  N >= 0.
88*> \endverbatim
89*>
90*> \param[in] KD
91*> \verbatim
92*>          KD is INTEGER
93*>          The number of superdiagonals of the matrix A if UPLO = 'U',
94*>          or the number of subdiagonals if UPLO = 'L'.  KD >= 0.
95*> \endverbatim
96*>
97*> \param[in,out] AB
98*> \verbatim
99*>          AB is REAL array, dimension (LDAB,N)
100*>          On entry, the upper or lower triangle of the symmetric band
101*>          matrix A, stored in the first KD+1 rows of the array.  The
102*>          j-th column of A is stored in the j-th column of the array AB
103*>          as follows:
104*>          if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
105*>          if UPLO = 'L', AB(1+i-j,j)    = A(i,j) for j<=i<=min(n,j+kd).
106*>          On exit, the diagonal elements of AB are overwritten by the
107*>          diagonal elements of the tridiagonal matrix T; if KD > 0, the
108*>          elements on the first superdiagonal (if UPLO = 'U') or the
109*>          first subdiagonal (if UPLO = 'L') are overwritten by the
110*>          off-diagonal elements of T; the rest of AB is overwritten by
111*>          values generated during the reduction.
112*> \endverbatim
113*>
114*> \param[in] LDAB
115*> \verbatim
116*>          LDAB is INTEGER
117*>          The leading dimension of the array AB.  LDAB >= KD+1.
118*> \endverbatim
119*>
120*> \param[out] D
121*> \verbatim
122*>          D is REAL array, dimension (N)
123*>          The diagonal elements of the tridiagonal matrix T.
124*> \endverbatim
125*>
126*> \param[out] E
127*> \verbatim
128*>          E is REAL array, dimension (N-1)
129*>          The off-diagonal elements of the tridiagonal matrix T:
130*>          E(i) = T(i,i+1) if UPLO = 'U'; E(i) = T(i+1,i) if UPLO = 'L'.
131*> \endverbatim
132*>
133*> \param[out] HOUS
134*> \verbatim
135*>          HOUS is REAL array, dimension LHOUS, that
136*>          store the Householder representation.
137*> \endverbatim
138*>
139*> \param[in] LHOUS
140*> \verbatim
141*>          LHOUS is INTEGER
142*>          The dimension of the array HOUS. LHOUS = MAX(1, dimension)
143*>          If LWORK = -1, or LHOUS=-1,
144*>          then a query is assumed; the routine
145*>          only calculates the optimal size of the HOUS array, returns
146*>          this value as the first entry of the HOUS array, and no error
147*>          message related to LHOUS is issued by XERBLA.
148*>          LHOUS = MAX(1, dimension) where
149*>          dimension = 4*N if VECT='N'
150*>          not available now if VECT='H'
151*> \endverbatim
152*>
153*> \param[out] WORK
154*> \verbatim
155*>          WORK is REAL array, dimension LWORK.
156*> \endverbatim
157*>
158*> \param[in] LWORK
159*> \verbatim
160*>          LWORK is INTEGER
161*>          The dimension of the array WORK. LWORK = MAX(1, dimension)
162*>          If LWORK = -1, or LHOUS=-1,
163*>          then a workspace query is assumed; the routine
164*>          only calculates the optimal size of the WORK array, returns
165*>          this value as the first entry of the WORK array, and no error
166*>          message related to LWORK is issued by XERBLA.
167*>          LWORK = MAX(1, dimension) where
168*>          dimension   = (2KD+1)*N + KD*NTHREADS
169*>          where KD is the blocking size of the reduction,
170*>          FACTOPTNB is the blocking used by the QR or LQ
171*>          algorithm, usually FACTOPTNB=128 is a good choice
172*>          NTHREADS is the number of threads used when
173*>          openMP compilation is enabled, otherwise =1.
174*> \endverbatim
175*>
176*> \param[out] INFO
177*> \verbatim
178*>          INFO is INTEGER
179*>          = 0:  successful exit
180*>          < 0:  if INFO = -i, the i-th argument had an illegal value
181*> \endverbatim
182*
183*  Authors:
184*  ========
185*
186*> \author Univ. of Tennessee
187*> \author Univ. of California Berkeley
188*> \author Univ. of Colorado Denver
189*> \author NAG Ltd.
190*
191*> \date November 2017
192*
193*> \ingroup real16OTHERcomputational
194*
195*> \par Further Details:
196*  =====================
197*>
198*> \verbatim
199*>
200*>  Implemented by Azzam Haidar.
201*>
202*>  All details are available on technical report, SC11, SC13 papers.
203*>
204*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
205*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
206*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
207*>  of 2011 International Conference for High Performance Computing,
208*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
209*>  Article 8 , 11 pages.
210*>  http://doi.acm.org/10.1145/2063384.2063394
211*>
212*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
213*>  An improved parallel singular value algorithm and its implementation
214*>  for multicore hardware, In Proceedings of 2013 International Conference
215*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
216*>  Denver, Colorado, USA, 2013.
217*>  Article 90, 12 pages.
218*>  http://doi.acm.org/10.1145/2503210.2503292
219*>
220*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
221*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
222*>  calculations based on fine-grained memory aware tasks.
223*>  International Journal of High Performance Computing Applications.
224*>  Volume 28 Issue 2, Pages 196-209, May 2014.
225*>  http://hpc.sagepub.com/content/28/2/196
226*>
227*> \endverbatim
228*>
229*  =====================================================================
230      SUBROUTINE SSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
231     $                         D, E, HOUS, LHOUS, WORK, LWORK, INFO )
232*
233#if defined(_OPENMP)
234      use omp_lib
235#endif
236*
237      IMPLICIT NONE
238*
239*  -- LAPACK computational routine (version 3.8.0) --
240*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
241*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
242*     November 2017
243*
244*     .. Scalar Arguments ..
245      CHARACTER          STAGE1, UPLO, VECT
246      INTEGER            N, KD, LDAB, LHOUS, LWORK, INFO
247*     ..
248*     .. Array Arguments ..
249      REAL               D( * ), E( * )
250      REAL               AB( LDAB, * ), HOUS( * ), WORK( * )
251*     ..
252*
253*  =====================================================================
254*
255*     .. Parameters ..
256      REAL               RZERO
257      REAL               ZERO, ONE
258      PARAMETER          ( RZERO = 0.0E+0,
259     $                   ZERO = 0.0E+0,
260     $                   ONE  = 1.0E+0 )
261*     ..
262*     .. Local Scalars ..
263      LOGICAL            LQUERY, WANTQ, UPPER, AFTERS1
264      INTEGER            I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST,
265     $                   ED, STIND, EDIND, BLKLASTIND, COLPT, THED,
266     $                   STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID,
267     $                   NBTILES, TTYPE, TID, NTHREADS, DEBUG,
268     $                   ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS,
269     $                   INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU,
270     $                   SISEV, SIZETAU, LDV, LHMIN, LWMIN
271*     ..
272*     .. External Subroutines ..
273      EXTERNAL           SSB2ST_KERNELS, SLACPY, SLASET, XERBLA
274*     ..
275*     .. Intrinsic Functions ..
276      INTRINSIC          MIN, MAX, CEILING, REAL
277*     ..
278*     .. External Functions ..
279      LOGICAL            LSAME
280      INTEGER            ILAENV2STAGE
281      EXTERNAL           LSAME, ILAENV2STAGE
282*     ..
283*     .. Executable Statements ..
284*
285*     Determine the minimal workspace size required.
286*     Test the input parameters
287*
288      DEBUG   = 0
289      INFO    = 0
290      AFTERS1 = LSAME( STAGE1, 'Y' )
291      WANTQ   = LSAME( VECT, 'V' )
292      UPPER   = LSAME( UPLO, 'U' )
293      LQUERY  = ( LWORK.EQ.-1 ) .OR. ( LHOUS.EQ.-1 )
294*
295*     Determine the block size, the workspace size and the hous size.
296*
297      IB     = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 )
298      LHMIN  = ILAENV2STAGE( 3, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
299      LWMIN  = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
300*
301      IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
302         INFO = -1
303      ELSE IF( .NOT.LSAME( VECT, 'N' ) ) THEN
304         INFO = -2
305      ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
306         INFO = -3
307      ELSE IF( N.LT.0 ) THEN
308         INFO = -4
309      ELSE IF( KD.LT.0 ) THEN
310         INFO = -5
311      ELSE IF( LDAB.LT.(KD+1) ) THEN
312         INFO = -7
313      ELSE IF( LHOUS.LT.LHMIN .AND. .NOT.LQUERY ) THEN
314         INFO = -11
315      ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
316         INFO = -13
317      END IF
318*
319      IF( INFO.EQ.0 ) THEN
320         HOUS( 1 ) = LHMIN
321         WORK( 1 ) = LWMIN
322      END IF
323*
324      IF( INFO.NE.0 ) THEN
325         CALL XERBLA( 'SSYTRD_SB2ST', -INFO )
326         RETURN
327      ELSE IF( LQUERY ) THEN
328         RETURN
329      END IF
330*
331*     Quick return if possible
332*
333      IF( N.EQ.0 ) THEN
334          HOUS( 1 ) = 1
335          WORK( 1 ) = 1
336          RETURN
337      END IF
338*
339*     Determine pointer position
340*
341      LDV      = KD + IB
342      SIZETAU  = 2 * N
343      SISEV    = 2 * N
344      INDTAU   = 1
345      INDV     = INDTAU + SIZETAU
346      LDA      = 2 * KD + 1
347      SIZEA    = LDA * N
348      INDA     = 1
349      INDW     = INDA + SIZEA
350      NTHREADS = 1
351      TID      = 0
352*
353      IF( UPPER ) THEN
354          APOS     = INDA + KD
355          AWPOS    = INDA
356          DPOS     = APOS + KD
357          OFDPOS   = DPOS - 1
358          ABDPOS   = KD + 1
359          ABOFDPOS = KD
360      ELSE
361          APOS     = INDA
362          AWPOS    = INDA + KD + 1
363          DPOS     = APOS
364          OFDPOS   = DPOS + 1
365          ABDPOS   = 1
366          ABOFDPOS = 2
367
368      ENDIF
369*
370*     Case KD=0:
371*     The matrix is diagonal. We just copy it (convert to "real" for
372*     real because D is double and the imaginary part should be 0)
373*     and store it in D. A sequential code here is better or
374*     in a parallel environment it might need two cores for D and E
375*
376      IF( KD.EQ.0 ) THEN
377          DO 30 I = 1, N
378              D( I ) = ( AB( ABDPOS, I ) )
379   30     CONTINUE
380          DO 40 I = 1, N-1
381              E( I ) = RZERO
382   40     CONTINUE
383*
384          HOUS( 1 ) = 1
385          WORK( 1 ) = 1
386          RETURN
387      END IF
388*
389*     Case KD=1:
390*     The matrix is already Tridiagonal. We have to make diagonal
391*     and offdiagonal elements real, and store them in D and E.
392*     For that, for real precision just copy the diag and offdiag
393*     to D and E while for the COMPLEX case the bulge chasing is
394*     performed to convert the hermetian tridiagonal to symmetric
395*     tridiagonal. A simpler coversion formula might be used, but then
396*     updating the Q matrix will be required and based if Q is generated
397*     or not this might complicate the story.
398*
399      IF( KD.EQ.1 ) THEN
400          DO 50 I = 1, N
401              D( I ) = ( AB( ABDPOS, I ) )
402   50     CONTINUE
403*
404          IF( UPPER ) THEN
405              DO 60 I = 1, N-1
406                 E( I ) = ( AB( ABOFDPOS, I+1 ) )
407   60         CONTINUE
408          ELSE
409              DO 70 I = 1, N-1
410                 E( I ) = ( AB( ABOFDPOS, I ) )
411   70         CONTINUE
412          ENDIF
413*
414          HOUS( 1 ) = 1
415          WORK( 1 ) = 1
416          RETURN
417      END IF
418*
419*     Main code start here.
420*     Reduce the symmetric band of A to a tridiagonal matrix.
421*
422      THGRSIZ   = N
423      GRSIZ     = 1
424      SHIFT     = 3
425      NBTILES   = CEILING( REAL(N)/REAL(KD) )
426      STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) )
427      THGRNB    = CEILING( REAL(N-1)/REAL(THGRSIZ) )
428*
429      CALL SLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA )
430      CALL SLASET( "A", KD,   N, ZERO, ZERO, WORK( AWPOS ), LDA )
431*
432*
433*     openMP parallelisation start here
434*
435#if defined(_OPENMP)
436!$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND )
437!$OMP$         PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID )
438!$OMP$         PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND )
439!$OMP$         SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK)
440!$OMP$         SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA )
441!$OMP$         SHARED ( STEPERCOL, THGRNB, THGRSIZ, GRSIZ, SHIFT )
442!$OMP MASTER
443#endif
444*
445*     main bulge chasing loop
446*
447      DO 100 THGRID = 1, THGRNB
448          STT  = (THGRID-1)*THGRSIZ+1
449          THED = MIN( (STT + THGRSIZ -1), (N-1))
450          DO 110 I = STT, N-1
451              ED = MIN( I, THED )
452              IF( STT.GT.ED ) EXIT
453              DO 120 M = 1, STEPERCOL
454                  ST = STT
455                  DO 130 SWEEPID = ST, ED
456                      DO 140 K = 1, GRSIZ
457                          MYID  = (I-SWEEPID)*(STEPERCOL*GRSIZ)
458     $                           + (M-1)*GRSIZ + K
459                          IF ( MYID.EQ.1 ) THEN
460                              TTYPE = 1
461                          ELSE
462                              TTYPE = MOD( MYID, 2 ) + 2
463                          ENDIF
464
465                          IF( TTYPE.EQ.2 ) THEN
466                              COLPT      = (MYID/2)*KD + SWEEPID
467                              STIND      = COLPT-KD+1
468                              EDIND      = MIN(COLPT,N)
469                              BLKLASTIND = COLPT
470                          ELSE
471                              COLPT      = ((MYID+1)/2)*KD + SWEEPID
472                              STIND      = COLPT-KD+1
473                              EDIND      = MIN(COLPT,N)
474                              IF( ( STIND.GE.EDIND-1 ).AND.
475     $                            ( EDIND.EQ.N ) ) THEN
476                                  BLKLASTIND = N
477                              ELSE
478                                  BLKLASTIND = 0
479                              ENDIF
480                          ENDIF
481*
482*                         Call the kernel
483*
484#if defined(_OPENMP) && _OPENMP >= 201307
485                          IF( TTYPE.NE.1 ) THEN
486!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
487!$OMP$     DEPEND(in:WORK(MYID-1))
488!$OMP$     DEPEND(out:WORK(MYID))
489                              TID      = OMP_GET_THREAD_NUM()
490                              CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE,
491     $                             STIND, EDIND, SWEEPID, N, KD, IB,
492     $                             WORK ( INDA ), LDA,
493     $                             HOUS( INDV ), HOUS( INDTAU ), LDV,
494     $                             WORK( INDW + TID*KD ) )
495!$OMP END TASK
496                          ELSE
497!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
498!$OMP$     DEPEND(out:WORK(MYID))
499                              TID      = OMP_GET_THREAD_NUM()
500                              CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE,
501     $                             STIND, EDIND, SWEEPID, N, KD, IB,
502     $                             WORK ( INDA ), LDA,
503     $                             HOUS( INDV ), HOUS( INDTAU ), LDV,
504     $                             WORK( INDW + TID*KD ) )
505!$OMP END TASK
506                          ENDIF
507#else
508                          CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE,
509     $                         STIND, EDIND, SWEEPID, N, KD, IB,
510     $                         WORK ( INDA ), LDA,
511     $                         HOUS( INDV ), HOUS( INDTAU ), LDV,
512     $                         WORK( INDW + TID*KD ) )
513#endif
514                          IF ( BLKLASTIND.GE.(N-1) ) THEN
515                              STT = STT + 1
516                              EXIT
517                          ENDIF
518  140                 CONTINUE
519  130             CONTINUE
520  120         CONTINUE
521  110     CONTINUE
522  100 CONTINUE
523*
524#if defined(_OPENMP)
525!$OMP END MASTER
526!$OMP END PARALLEL
527#endif
528*
529*     Copy the diagonal from A to D. Note that D is REAL thus only
530*     the Real part is needed, the imaginary part should be zero.
531*
532      DO 150 I = 1, N
533          D( I ) = ( WORK( DPOS+(I-1)*LDA ) )
534  150 CONTINUE
535*
536*     Copy the off diagonal from A to E. Note that E is REAL thus only
537*     the Real part is needed, the imaginary part should be zero.
538*
539      IF( UPPER ) THEN
540          DO 160 I = 1, N-1
541             E( I ) = ( WORK( OFDPOS+I*LDA ) )
542  160     CONTINUE
543      ELSE
544          DO 170 I = 1, N-1
545             E( I ) = ( WORK( OFDPOS+(I-1)*LDA ) )
546  170     CONTINUE
547      ENDIF
548*
549      HOUS( 1 ) = LHMIN
550      WORK( 1 ) = LWMIN
551      RETURN
552*
553*     End of SSYTRD_SB2ST
554*
555      END
556
557