1*> \brief \b SSYTRD_SB2ST reduces a real symmetric band matrix A to real symmetric tridiagonal form T
2*
3*  =========== DOCUMENTATION ===========
4*
5* Online html documentation available at
6*            http://www.netlib.org/lapack/explore-html/
7*
8*> \htmlonly
9*> Download SSYTRD_SB2ST + dependencies
10*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssytrd_sb2st.f">
11*> [TGZ]</a>
12*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssytrd_sb2st.f">
13*> [ZIP]</a>
14*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssytrd_sb2st.f">
15*> [TXT]</a>
16*> \endhtmlonly
17*
18*  Definition:
19*  ===========
20*
21*       SUBROUTINE SSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
22*                               D, E, HOUS, LHOUS, WORK, LWORK, INFO )
23*
24*       #if defined(_OPENMP)
25*       use omp_lib
26*       #endif
27*
28*       IMPLICIT NONE
29*
30*       .. Scalar Arguments ..
31*       CHARACTER          STAGE1, UPLO, VECT
32*       INTEGER            N, KD, IB, LDAB, LHOUS, LWORK, INFO
33*       ..
34*       .. Array Arguments ..
35*       REAL               D( * ), E( * )
36*       REAL               AB( LDAB, * ), HOUS( * ), WORK( * )
37*       ..
38*
39*
40*> \par Purpose:
41*  =============
42*>
43*> \verbatim
44*>
45*> SSYTRD_SB2ST reduces a real symmetric band matrix A to real symmetric
46*> tridiagonal form T by a orthogonal similarity transformation:
47*> Q**T * A * Q = T.
48*> \endverbatim
49*
50*  Arguments:
51*  ==========
52*
53*> \param[in] STAGE1
54*> \verbatim
55*>          STAGE1 is CHARACTER*1
56*>          = 'N':  "No": to mention that the stage 1 of the reduction
57*>                  from dense to band using the ssytrd_sy2sb routine
58*>                  was not called before this routine to reproduce AB.
59*>                  In other term this routine is called as standalone.
60*>          = 'Y':  "Yes": to mention that the stage 1 of the
61*>                  reduction from dense to band using the ssytrd_sy2sb
62*>                  routine has been called to produce AB (e.g., AB is
63*>                  the output of ssytrd_sy2sb.
64*> \endverbatim
65*>
66*> \param[in] VECT
67*> \verbatim
68*>          VECT is CHARACTER*1
69*>          = 'N':  No need for the Housholder representation,
70*>                  and thus LHOUS is of size max(1, 4*N);
71*>          = 'V':  the Householder representation is needed to
72*>                  either generate or to apply Q later on,
73*>                  then LHOUS is to be queried and computed.
74*>                  (NOT AVAILABLE IN THIS RELEASE).
75*> \endverbatim
76*>
77*> \param[in] UPLO
78*> \verbatim
79*>          UPLO is CHARACTER*1
80*>          = 'U':  Upper triangle of A is stored;
81*>          = 'L':  Lower triangle of A is stored.
82*> \endverbatim
83*>
84*> \param[in] N
85*> \verbatim
86*>          N is INTEGER
87*>          The order of the matrix A.  N >= 0.
88*> \endverbatim
89*>
90*> \param[in] KD
91*> \verbatim
92*>          KD is INTEGER
93*>          The number of superdiagonals of the matrix A if UPLO = 'U',
94*>          or the number of subdiagonals if UPLO = 'L'.  KD >= 0.
95*> \endverbatim
96*>
97*> \param[in,out] AB
98*> \verbatim
99*>          AB is REAL array, dimension (LDAB,N)
100*>          On entry, the upper or lower triangle of the symmetric band
101*>          matrix A, stored in the first KD+1 rows of the array.  The
102*>          j-th column of A is stored in the j-th column of the array AB
103*>          as follows:
104*>          if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
105*>          if UPLO = 'L', AB(1+i-j,j)    = A(i,j) for j<=i<=min(n,j+kd).
106*>          On exit, the diagonal elements of AB are overwritten by the
107*>          diagonal elements of the tridiagonal matrix T; if KD > 0, the
108*>          elements on the first superdiagonal (if UPLO = 'U') or the
109*>          first subdiagonal (if UPLO = 'L') are overwritten by the
110*>          off-diagonal elements of T; the rest of AB is overwritten by
111*>          values generated during the reduction.
112*> \endverbatim
113*>
114*> \param[in] LDAB
115*> \verbatim
116*>          LDAB is INTEGER
117*>          The leading dimension of the array AB.  LDAB >= KD+1.
118*> \endverbatim
119*>
120*> \param[out] D
121*> \verbatim
122*>          D is REAL array, dimension (N)
123*>          The diagonal elements of the tridiagonal matrix T.
124*> \endverbatim
125*>
126*> \param[out] E
127*> \verbatim
128*>          E is REAL array, dimension (N-1)
129*>          The off-diagonal elements of the tridiagonal matrix T:
130*>          E(i) = T(i,i+1) if UPLO = 'U'; E(i) = T(i+1,i) if UPLO = 'L'.
131*> \endverbatim
132*>
133*> \param[out] HOUS
134*> \verbatim
135*>          HOUS is REAL array, dimension LHOUS, that
136*>          store the Householder representation.
137*> \endverbatim
138*>
139*> \param[in] LHOUS
140*> \verbatim
141*>          LHOUS is INTEGER
142*>          The dimension of the array HOUS. LHOUS = MAX(1, dimension)
143*>          If LWORK = -1, or LHOUS=-1,
144*>          then a query is assumed; the routine
145*>          only calculates the optimal size of the HOUS array, returns
146*>          this value as the first entry of the HOUS array, and no error
147*>          message related to LHOUS is issued by XERBLA.
148*>          LHOUS = MAX(1, dimension) where
149*>          dimension = 4*N if VECT='N'
150*>          not available now if VECT='H'
151*> \endverbatim
152*>
153*> \param[out] WORK
154*> \verbatim
155*>          WORK is REAL array, dimension LWORK.
156*> \endverbatim
157*>
158*> \param[in] LWORK
159*> \verbatim
160*>          LWORK is INTEGER
161*>          The dimension of the array WORK. LWORK = MAX(1, dimension)
162*>          If LWORK = -1, or LHOUS=-1,
163*>          then a workspace query is assumed; the routine
164*>          only calculates the optimal size of the WORK array, returns
165*>          this value as the first entry of the WORK array, and no error
166*>          message related to LWORK is issued by XERBLA.
167*>          LWORK = MAX(1, dimension) where
168*>          dimension   = (2KD+1)*N + KD*NTHREADS
169*>          where KD is the blocking size of the reduction,
170*>          FACTOPTNB is the blocking used by the QR or LQ
171*>          algorithm, usually FACTOPTNB=128 is a good choice
172*>          NTHREADS is the number of threads used when
173*>          openMP compilation is enabled, otherwise =1.
174*> \endverbatim
175*>
176*> \param[out] INFO
177*> \verbatim
178*>          INFO is INTEGER
179*>          = 0:  successful exit
180*>          < 0:  if INFO = -i, the i-th argument had an illegal value
181*> \endverbatim
182*
183*  Authors:
184*  ========
185*
186*> \author Univ. of Tennessee
187*> \author Univ. of California Berkeley
188*> \author Univ. of Colorado Denver
189*> \author NAG Ltd.
190*
191*> \ingroup real16OTHERcomputational
192*
193*> \par Further Details:
194*  =====================
195*>
196*> \verbatim
197*>
198*>  Implemented by Azzam Haidar.
199*>
200*>  All details are available on technical report, SC11, SC13 papers.
201*>
202*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
203*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
204*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
205*>  of 2011 International Conference for High Performance Computing,
206*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
207*>  Article 8 , 11 pages.
208*>  http://doi.acm.org/10.1145/2063384.2063394
209*>
210*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
211*>  An improved parallel singular value algorithm and its implementation
212*>  for multicore hardware, In Proceedings of 2013 International Conference
213*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
214*>  Denver, Colorado, USA, 2013.
215*>  Article 90, 12 pages.
216*>  http://doi.acm.org/10.1145/2503210.2503292
217*>
218*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
219*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
220*>  calculations based on fine-grained memory aware tasks.
221*>  International Journal of High Performance Computing Applications.
222*>  Volume 28 Issue 2, Pages 196-209, May 2014.
223*>  http://hpc.sagepub.com/content/28/2/196
224*>
225*> \endverbatim
226*>
227*  =====================================================================
228      SUBROUTINE SSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
229     $                         D, E, HOUS, LHOUS, WORK, LWORK, INFO )
230*
231#if defined(_OPENMP)
232      use omp_lib
233#endif
234*
235      IMPLICIT NONE
236*
237*  -- LAPACK computational routine --
238*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
239*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
240*
241*     .. Scalar Arguments ..
242      CHARACTER          STAGE1, UPLO, VECT
243      INTEGER            N, KD, LDAB, LHOUS, LWORK, INFO
244*     ..
245*     .. Array Arguments ..
246      REAL               D( * ), E( * )
247      REAL               AB( LDAB, * ), HOUS( * ), WORK( * )
248*     ..
249*
250*  =====================================================================
251*
252*     .. Parameters ..
253      REAL               RZERO
254      REAL               ZERO, ONE
255      PARAMETER          ( RZERO = 0.0E+0,
256     $                   ZERO = 0.0E+0,
257     $                   ONE  = 1.0E+0 )
258*     ..
259*     .. Local Scalars ..
260      LOGICAL            LQUERY, WANTQ, UPPER, AFTERS1
261      INTEGER            I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST,
262     $                   ED, STIND, EDIND, BLKLASTIND, COLPT, THED,
263     $                   STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID,
264     $                   NBTILES, TTYPE, TID, NTHREADS, DEBUG,
265     $                   ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS,
266     $                   INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU,
267     $                   SISEV, SIZETAU, LDV, LHMIN, LWMIN
268*     ..
269*     .. External Subroutines ..
270      EXTERNAL           SSB2ST_KERNELS, SLACPY, SLASET, XERBLA
271*     ..
272*     .. Intrinsic Functions ..
273      INTRINSIC          MIN, MAX, CEILING, REAL
274*     ..
275*     .. External Functions ..
276      LOGICAL            LSAME
277      INTEGER            ILAENV2STAGE
278      EXTERNAL           LSAME, ILAENV2STAGE
279*     ..
280*     .. Executable Statements ..
281*
282*     Determine the minimal workspace size required.
283*     Test the input parameters
284*
285      DEBUG   = 0
286      INFO    = 0
287      AFTERS1 = LSAME( STAGE1, 'Y' )
288      WANTQ   = LSAME( VECT, 'V' )
289      UPPER   = LSAME( UPLO, 'U' )
290      LQUERY  = ( LWORK.EQ.-1 ) .OR. ( LHOUS.EQ.-1 )
291*
292*     Determine the block size, the workspace size and the hous size.
293*
294      IB     = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 )
295      LHMIN  = ILAENV2STAGE( 3, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
296      LWMIN  = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
297*
298      IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
299         INFO = -1
300      ELSE IF( .NOT.LSAME( VECT, 'N' ) ) THEN
301         INFO = -2
302      ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
303         INFO = -3
304      ELSE IF( N.LT.0 ) THEN
305         INFO = -4
306      ELSE IF( KD.LT.0 ) THEN
307         INFO = -5
308      ELSE IF( LDAB.LT.(KD+1) ) THEN
309         INFO = -7
310      ELSE IF( LHOUS.LT.LHMIN .AND. .NOT.LQUERY ) THEN
311         INFO = -11
312      ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
313         INFO = -13
314      END IF
315*
316      IF( INFO.EQ.0 ) THEN
317         HOUS( 1 ) = LHMIN
318         WORK( 1 ) = LWMIN
319      END IF
320*
321      IF( INFO.NE.0 ) THEN
322         CALL XERBLA( 'SSYTRD_SB2ST', -INFO )
323         RETURN
324      ELSE IF( LQUERY ) THEN
325         RETURN
326      END IF
327*
328*     Quick return if possible
329*
330      IF( N.EQ.0 ) THEN
331          HOUS( 1 ) = 1
332          WORK( 1 ) = 1
333          RETURN
334      END IF
335*
336*     Determine pointer position
337*
338      LDV      = KD + IB
339      SIZETAU  = 2 * N
340      SISEV    = 2 * N
341      INDTAU   = 1
342      INDV     = INDTAU + SIZETAU
343      LDA      = 2 * KD + 1
344      SIZEA    = LDA * N
345      INDA     = 1
346      INDW     = INDA + SIZEA
347      NTHREADS = 1
348      TID      = 0
349*
350      IF( UPPER ) THEN
351          APOS     = INDA + KD
352          AWPOS    = INDA
353          DPOS     = APOS + KD
354          OFDPOS   = DPOS - 1
355          ABDPOS   = KD + 1
356          ABOFDPOS = KD
357      ELSE
358          APOS     = INDA
359          AWPOS    = INDA + KD + 1
360          DPOS     = APOS
361          OFDPOS   = DPOS + 1
362          ABDPOS   = 1
363          ABOFDPOS = 2
364
365      ENDIF
366*
367*     Case KD=0:
368*     The matrix is diagonal. We just copy it (convert to "real" for
369*     real because D is double and the imaginary part should be 0)
370*     and store it in D. A sequential code here is better or
371*     in a parallel environment it might need two cores for D and E
372*
373      IF( KD.EQ.0 ) THEN
374          DO 30 I = 1, N
375              D( I ) = ( AB( ABDPOS, I ) )
376   30     CONTINUE
377          DO 40 I = 1, N-1
378              E( I ) = RZERO
379   40     CONTINUE
380*
381          HOUS( 1 ) = 1
382          WORK( 1 ) = 1
383          RETURN
384      END IF
385*
386*     Case KD=1:
387*     The matrix is already Tridiagonal. We have to make diagonal
388*     and offdiagonal elements real, and store them in D and E.
389*     For that, for real precision just copy the diag and offdiag
390*     to D and E while for the COMPLEX case the bulge chasing is
391*     performed to convert the hermetian tridiagonal to symmetric
392*     tridiagonal. A simpler conversion formula might be used, but then
393*     updating the Q matrix will be required and based if Q is generated
394*     or not this might complicate the story.
395*
396      IF( KD.EQ.1 ) THEN
397          DO 50 I = 1, N
398              D( I ) = ( AB( ABDPOS, I ) )
399   50     CONTINUE
400*
401          IF( UPPER ) THEN
402              DO 60 I = 1, N-1
403                 E( I ) = ( AB( ABOFDPOS, I+1 ) )
404   60         CONTINUE
405          ELSE
406              DO 70 I = 1, N-1
407                 E( I ) = ( AB( ABOFDPOS, I ) )
408   70         CONTINUE
409          ENDIF
410*
411          HOUS( 1 ) = 1
412          WORK( 1 ) = 1
413          RETURN
414      END IF
415*
416*     Main code start here.
417*     Reduce the symmetric band of A to a tridiagonal matrix.
418*
419      THGRSIZ   = N
420      GRSIZ     = 1
421      SHIFT     = 3
422      NBTILES   = CEILING( REAL(N)/REAL(KD) )
423      STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) )
424      THGRNB    = CEILING( REAL(N-1)/REAL(THGRSIZ) )
425*
426      CALL SLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA )
427      CALL SLASET( "A", KD,   N, ZERO, ZERO, WORK( AWPOS ), LDA )
428*
429*
430*     openMP parallelisation start here
431*
432#if defined(_OPENMP)
433!$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND )
434!$OMP$         PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID )
435!$OMP$         PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND )
436!$OMP$         SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK)
437!$OMP$         SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA )
438!$OMP$         SHARED ( STEPERCOL, THGRNB, THGRSIZ, GRSIZ, SHIFT )
439!$OMP MASTER
440#endif
441*
442*     main bulge chasing loop
443*
444      DO 100 THGRID = 1, THGRNB
445          STT  = (THGRID-1)*THGRSIZ+1
446          THED = MIN( (STT + THGRSIZ -1), (N-1))
447          DO 110 I = STT, N-1
448              ED = MIN( I, THED )
449              IF( STT.GT.ED ) EXIT
450              DO 120 M = 1, STEPERCOL
451                  ST = STT
452                  DO 130 SWEEPID = ST, ED
453                      DO 140 K = 1, GRSIZ
454                          MYID  = (I-SWEEPID)*(STEPERCOL*GRSIZ)
455     $                           + (M-1)*GRSIZ + K
456                          IF ( MYID.EQ.1 ) THEN
457                              TTYPE = 1
458                          ELSE
459                              TTYPE = MOD( MYID, 2 ) + 2
460                          ENDIF
461
462                          IF( TTYPE.EQ.2 ) THEN
463                              COLPT      = (MYID/2)*KD + SWEEPID
464                              STIND      = COLPT-KD+1
465                              EDIND      = MIN(COLPT,N)
466                              BLKLASTIND = COLPT
467                          ELSE
468                              COLPT      = ((MYID+1)/2)*KD + SWEEPID
469                              STIND      = COLPT-KD+1
470                              EDIND      = MIN(COLPT,N)
471                              IF( ( STIND.GE.EDIND-1 ).AND.
472     $                            ( EDIND.EQ.N ) ) THEN
473                                  BLKLASTIND = N
474                              ELSE
475                                  BLKLASTIND = 0
476                              ENDIF
477                          ENDIF
478*
479*                         Call the kernel
480*
481#if defined(_OPENMP) && _OPENMP >= 201307
482                          IF( TTYPE.NE.1 ) THEN
483!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
484!$OMP$     DEPEND(in:WORK(MYID-1))
485!$OMP$     DEPEND(out:WORK(MYID))
486                              TID      = OMP_GET_THREAD_NUM()
487                              CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE,
488     $                             STIND, EDIND, SWEEPID, N, KD, IB,
489     $                             WORK ( INDA ), LDA,
490     $                             HOUS( INDV ), HOUS( INDTAU ), LDV,
491     $                             WORK( INDW + TID*KD ) )
492!$OMP END TASK
493                          ELSE
494!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
495!$OMP$     DEPEND(out:WORK(MYID))
496                              TID      = OMP_GET_THREAD_NUM()
497                              CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE,
498     $                             STIND, EDIND, SWEEPID, N, KD, IB,
499     $                             WORK ( INDA ), LDA,
500     $                             HOUS( INDV ), HOUS( INDTAU ), LDV,
501     $                             WORK( INDW + TID*KD ) )
502!$OMP END TASK
503                          ENDIF
504#else
505                          CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE,
506     $                         STIND, EDIND, SWEEPID, N, KD, IB,
507     $                         WORK ( INDA ), LDA,
508     $                         HOUS( INDV ), HOUS( INDTAU ), LDV,
509     $                         WORK( INDW + TID*KD ) )
510#endif
511                          IF ( BLKLASTIND.GE.(N-1) ) THEN
512                              STT = STT + 1
513                              EXIT
514                          ENDIF
515  140                 CONTINUE
516  130             CONTINUE
517  120         CONTINUE
518  110     CONTINUE
519  100 CONTINUE
520*
521#if defined(_OPENMP)
522!$OMP END MASTER
523!$OMP END PARALLEL
524#endif
525*
526*     Copy the diagonal from A to D. Note that D is REAL thus only
527*     the Real part is needed, the imaginary part should be zero.
528*
529      DO 150 I = 1, N
530          D( I ) = ( WORK( DPOS+(I-1)*LDA ) )
531  150 CONTINUE
532*
533*     Copy the off diagonal from A to E. Note that E is REAL thus only
534*     the Real part is needed, the imaginary part should be zero.
535*
536      IF( UPPER ) THEN
537          DO 160 I = 1, N-1
538             E( I ) = ( WORK( OFDPOS+I*LDA ) )
539  160     CONTINUE
540      ELSE
541          DO 170 I = 1, N-1
542             E( I ) = ( WORK( OFDPOS+(I-1)*LDA ) )
543  170     CONTINUE
544      ENDIF
545*
546      HOUS( 1 ) = LHMIN
547      WORK( 1 ) = LWMIN
548      RETURN
549*
550*     End of SSYTRD_SB2ST
551*
552      END
553
554