1      SUBROUTINE PDSYTD2( UPLO, N, A, IA, JA, DESCA, D, E, TAU, WORK,
2     $                    LWORK, INFO )
3*
4*  -- ScaLAPACK auxiliary routine (version 1.7) --
5*     University of Tennessee, Knoxville, Oak Ridge National Laboratory,
6*     and University of California, Berkeley.
7*     May 1, 1997
8*
9*     .. Scalar Arguments ..
10      CHARACTER          UPLO
11      INTEGER            IA, INFO, JA, LWORK, N
12*     ..
13*     .. Array Arguments ..
14      INTEGER            DESCA( * )
15      DOUBLE PRECISION    A( * ), D( * ), E( * ), TAU( * ), WORK( * )
16*     ..
17*
18*  Purpose
19*  =======
20*
21*  PDSYTD2 reduces a real symmetric matrix sub( A ) to symmetric
22*  tridiagonal form T by an orthogonal similarity transformation:
23*  Q' * sub( A ) * Q = T, where sub( A ) = A(IA:IA+N-1,JA:JA+N-1).
24*
25*  Notes
26*  =====
27*
28*  Each global data object is described by an associated description
29*  vector.  This vector stores the information required to establish
30*  the mapping between an object element and its corresponding process
31*  and memory location.
32*
33*  Let A be a generic term for any 2D block cyclicly distributed array.
34*  Such a global array has an associated description vector DESCA.
35*  In the following comments, the character _ should be read as
36*  "of the global array".
37*
38*  NOTATION        STORED IN      EXPLANATION
39*  --------------- -------------- --------------------------------------
40*  DTYPE_A(global) DESCA( DTYPE_ )The descriptor type.  In this case,
41*                                 DTYPE_A = 1.
42*  CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
43*                                 the BLACS process grid A is distribu-
44*                                 ted over. The context itself is glo-
45*                                 bal, but the handle (the integer
46*                                 value) may vary.
47*  M_A    (global) DESCA( M_ )    The number of rows in the global
48*                                 array A.
49*  N_A    (global) DESCA( N_ )    The number of columns in the global
50*                                 array A.
51*  MB_A   (global) DESCA( MB_ )   The blocking factor used to distribute
52*                                 the rows of the array.
53*  NB_A   (global) DESCA( NB_ )   The blocking factor used to distribute
54*                                 the columns of the array.
55*  RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
56*                                 row of the array A is distributed.
57*  CSRC_A (global) DESCA( CSRC_ ) The process column over which the
58*                                 first column of the array A is
59*                                 distributed.
60*  LLD_A  (local)  DESCA( LLD_ )  The leading dimension of the local
61*                                 array.  LLD_A >= MAX(1,LOCr(M_A)).
62*
63*  Let K be the number of rows or columns of a distributed matrix,
64*  and assume that its process grid has dimension p x q.
65*  LOCr( K ) denotes the number of elements of K that a process
66*  would receive if K were distributed over the p processes of its
67*  process column.
68*  Similarly, LOCc( K ) denotes the number of elements of K that a
69*  process would receive if K were distributed over the q processes of
70*  its process row.
71*  The values of LOCr() and LOCc() may be determined via a call to the
72*  ScaLAPACK tool function, NUMROC:
73*          LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
74*          LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
75*  An upper bound for these quantities may be computed by:
76*          LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
77*          LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
78*
79*  Arguments
80*  =========
81*
82*  UPLO    (global input) CHARACTER
83*          Specifies whether the upper or lower triangular part of the
84*          symmetric matrix sub( A ) is stored:
85*          = 'U':  Upper triangular
86*          = 'L':  Lower triangular
87*
88*  N       (global input) INTEGER
89*          The number of rows and columns to be operated on, i.e. the
90*          order of the distributed submatrix sub( A ). N >= 0.
91*
92*  A       (local input/local output) DOUBLE PRECISION pointer into the
93*          local memory to an array of dimension (LLD_A,LOCc(JA+N-1)).
94*          On entry, this array contains the local pieces of the
95*          symmetric distributed matrix sub( A ).  If UPLO = 'U', the
96*          leading N-by-N upper triangular part of sub( A ) contains
97*          the upper triangular part of the matrix, and its strictly
98*          lower triangular part is not referenced. If UPLO = 'L', the
99*          leading N-by-N lower triangular part of sub( A ) contains the
100*          lower triangular part of the matrix, and its strictly upper
101*          triangular part is not referenced. On exit, if UPLO = 'U',
102*          the diagonal and first superdiagonal of sub( A ) are over-
103*          written by the corresponding elements of the tridiagonal
104*          matrix T, and the elements above the first superdiagonal,
105*          with the array TAU, represent the orthogonal matrix Q as a
106*          product of elementary reflectors; if UPLO = 'L', the diagonal
107*          and first subdiagonal of sub( A ) are overwritten by the
108*          corresponding elements of the tridiagonal matrix T, and the
109*          elements below the first subdiagonal, with the array TAU,
110*          represent the orthogonal matrix Q as a product of elementary
111*          reflectors. See Further Details.
112*
113*  IA      (global input) INTEGER
114*          The row index in the global array A indicating the first
115*          row of sub( A ).
116*
117*  JA      (global input) INTEGER
118*          The column index in the global array A indicating the
119*          first column of sub( A ).
120*
121*  DESCA   (global and local input) INTEGER array of dimension DLEN_.
122*          The array descriptor for the distributed matrix A.
123*
124*  D       (local output) DOUBLE PRECISION array, dimension LOCc(JA+N-1)
125*          The diagonal elements of the tridiagonal matrix T:
126*          D(i) = A(i,i). D is tied to the distributed matrix A.
127*
128*  E       (local output) DOUBLE PRECISION array, dimension LOCc(JA+N-1)
129*          if UPLO = 'U', LOCc(JA+N-2) otherwise. The off-diagonal
130*          elements of the tridiagonal matrix T: E(i) = A(i,i+1) if
131*          UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. E is tied to the
132*          distributed matrix A.
133*
134*  TAU     (local output) DOUBLE PRECISION array, dimension
135*          LOCc(JA+N-1). This array contains the scalar factors TAU of
136*          the elementary reflectors. TAU is tied to the distributed
137*          matrix A.
138*
139*  WORK    (local workspace/local output) DOUBLE PRECISION array,
140*                                                    dimension (LWORK)
141*          On exit, WORK( 1 ) returns the minimal and optimal LWORK.
142*
143*  LWORK   (local or global input) INTEGER
144*          The dimension of the array WORK.
145*          LWORK is local input and must be at least
146*          LWORK >= 3*N.
147*
148*          If LWORK = -1, then LWORK is global input and a workspace
149*          query is assumed; the routine only calculates the minimum
150*          and optimal size for all work arrays. Each of these
151*          values is returned in the first entry of the corresponding
152*          work array, and no error message is issued by PXERBLA.
153*
154*  INFO    (local output) INTEGER
155*          = 0:  successful exit
156*          < 0:  If the i-th argument is an array and the j-entry had
157*                an illegal value, then INFO = -(i*100+j), if the i-th
158*                argument is a scalar and had an illegal value, then
159*                INFO = -i.
160*
161*  Further Details
162*  ===============
163*
164*  If UPLO = 'U', the matrix Q is represented as a product of elementary
165*  reflectors
166*
167*     Q = H(n-1) . . . H(2) H(1).
168*
169*  Each H(i) has the form
170*
171*     H(i) = I - tau * v * v'
172*
173*  where tau is a real scalar, and v is a real vector with
174*  v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in
175*  A(ia:ia+i-2,ja+i), and tau in TAU(ja+i-1).
176*
177*  If UPLO = 'L', the matrix Q is represented as a product of elementary
178*  reflectors
179*
180*     Q = H(1) H(2) . . . H(n-1).
181*
182*  Each H(i) has the form
183*
184*     H(i) = I - tau * v * v'
185*
186*  where tau is a real scalar, and v is a real vector with
187*  v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in
188*  A(ia+i+1:ia+n-1,ja+i-1), and tau in TAU(ja+i-1).
189*
190*  The contents of sub( A ) on exit are illustrated by the following
191*  examples with n = 5:
192*
193*  if UPLO = 'U':                       if UPLO = 'L':
194*
195*    (  d   e   v2  v3  v4 )              (  d                  )
196*    (      d   e   v3  v4 )              (  e   d              )
197*    (          d   e   v4 )              (  v1  e   d          )
198*    (              d   e  )              (  v1  v2  e   d      )
199*    (                  d  )              (  v1  v2  v3  e   d  )
200*
201*  where d and e denote diagonal and off-diagonal elements of T, and vi
202*  denotes an element of the vector defining H(i).
203*
204*  Alignment requirements
205*  ======================
206*
207*  The distributed submatrix sub( A ) must verify some alignment proper-
208*  ties, namely the following expression should be true:
209*  ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA ) with
210*  IROFFA = MOD( IA-1, MB_A ) and ICOFFA = MOD( JA-1, NB_A ).
211*
212*  =====================================================================
213*
214*     .. Parameters ..
215      INTEGER            BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
216     $                   LLD_, MB_, M_, NB_, N_, RSRC_
217      PARAMETER          ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1,
218     $                     CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6,
219     $                     RSRC_ = 7, CSRC_ = 8, LLD_ = 9 )
220      DOUBLE PRECISION   HALF, ONE, ZERO
221      PARAMETER          ( HALF = 0.5D+0, ONE = 1.0D+0, ZERO = 0.0D+0 )
222*     ..
223*     .. Local Scalars ..
224      LOGICAL            LQUERY, UPPER
225      INTEGER            IACOL, IAROW, ICOFFA, ICTXT, II, IK, IROFFA, J,
226     $                   JJ, JK, JN, LDA, LWMIN, MYCOL, MYROW, NPCOL,
227     $                   NPROW
228      DOUBLE PRECISION   ALPHA, TAUI
229*     ..
230*     .. External Subroutines ..
231      EXTERNAL           BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, DAXPY,
232     $                   DGEBR2D, DGEBS2D, DLARFG,
233     $                   DSYMV, DSYR2, INFOG2L, PXERBLA
234*     ..
235*     .. External Functions ..
236      LOGICAL            LSAME
237      DOUBLE PRECISION   DDOT
238      EXTERNAL           LSAME, DDOT
239*     ..
240*     .. Intrinsic Functions ..
241      INTRINSIC          DBLE
242*     ..
243*     .. Executable Statements ..
244*
245*     Get grid parameters
246*
247      ICTXT = DESCA( CTXT_ )
248      CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
249*
250*     Test the input parameters
251*
252      INFO = 0
253      IF( NPROW.EQ.-1 ) THEN
254         INFO = -(600+CTXT_)
255      ELSE
256         UPPER = LSAME( UPLO, 'U' )
257         CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO )
258         LWMIN = 3 * N
259*
260         WORK( 1 ) = DBLE( LWMIN )
261         LQUERY = ( LWORK.EQ.-1 )
262         IF( INFO.EQ.0 ) THEN
263            IROFFA = MOD( IA-1, DESCA( MB_ ) )
264            ICOFFA = MOD( JA-1, DESCA( NB_ ) )
265            IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
266               INFO = -1
267            ELSE IF( IROFFA.NE.ICOFFA ) THEN
268               INFO = -5
269            ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN
270               INFO = -(600+NB_)
271            ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
272               INFO = -11
273            END IF
274         END IF
275      END IF
276*
277      IF( INFO.NE.0 ) THEN
278         CALL PXERBLA( ICTXT, 'PDSYTD2', -INFO )
279         CALL BLACS_ABORT( ICTXT, 1 )
280         RETURN
281      ELSE IF( LQUERY ) THEN
282         RETURN
283      END IF
284*
285*     Quick return if possible
286*
287      IF( N.LE.0 )
288     $   RETURN
289*
290*     Compute local information
291*
292      LDA = DESCA( LLD_ )
293      CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, JJ,
294     $              IAROW, IACOL )
295*
296      IF( UPPER ) THEN
297*
298*        Process(IAROW, IACOL) owns block to be reduced
299*
300         IF( MYCOL.EQ.IACOL ) THEN
301            IF( MYROW.EQ.IAROW ) THEN
302*
303*              Reduce the upper triangle of sub( A )
304*
305               DO 10 J = N-1, 1, -1
306                  IK = II + J - 1
307                  JK = JJ + J - 1
308*
309*                 Generate elementary reflector H(i) = I - tau * v * v'
310*                 to annihilate A(IA:IA+J-1,JA:JA+J-1)
311*
312                  CALL DLARFG( J, A( IK+JK*LDA ), A( II+JK*LDA ), 1,
313     $                         TAUI )
314                  E( JK+1 ) = A( IK+JK*LDA )
315*
316                  IF( TAUI.NE.ZERO ) THEN
317*
318*                    Apply H(i) from both sides to
319*                    A(IA:IA+J-1,JA:JA+J-1)
320*
321                     A( IK+JK*LDA ) = ONE
322*
323*                    Compute  x := tau * A * v  storing x in TAU(1:i)
324*
325                     CALL DSYMV( UPLO, J, TAUI, A( II+(JJ-1)*LDA ),
326     $                           LDA, A( II+JK*LDA ), 1, ZERO,
327     $                           TAU( JJ ), 1 )
328*
329*                    Compute  w := x - 1/2 * tau * (x'*v) * v
330*
331                     ALPHA = -HALF*TAUI*DDOT( J, TAU( JJ ), 1,
332     $                                        A( II+JK*LDA ), 1 )
333                     CALL DAXPY( J, ALPHA, A( II+JK*LDA ), 1,
334     $                           TAU( JJ ), 1 )
335*
336*                    Apply the transformation as a rank-2 update:
337*                       A := A - v * w' - w * v'
338*
339                     CALL DSYR2( UPLO, J, -ONE, A( II+JK*LDA ), 1,
340     $                           TAU( JJ ), 1, A( II+(JJ-1)*LDA ),
341     $                           LDA )
342                     A( IK+JK*LDA ) = E( JK+1 )
343                  END IF
344*
345*                 Copy D, E, TAU to broadcast them columnwise.
346*
347                  D( JK+1 ) = A( IK+1+JK*LDA )
348                  WORK( J+1 ) = D( JK+1 )
349                  WORK( N+J+1 ) = E( JK+1 )
350                  TAU( JK+1 ) = TAUI
351                  WORK( 2*N+J+1 ) = TAU( JK+1 )
352*
353   10          CONTINUE
354               D( JJ ) = A( II+(JJ-1)*LDA )
355               WORK( 1 ) = D( JJ )
356               WORK( N+1 ) = ZERO
357               WORK( 2*N+1 ) = ZERO
358*
359               CALL DGEBS2D( ICTXT, 'Columnwise', ' ', 1, 3*N, WORK, 1 )
360*
361            ELSE
362               CALL DGEBR2D( ICTXT, 'Columnwise', ' ', 1, 3*N, WORK, 1,
363     $                       IAROW, IACOL )
364               DO 20 J = 2, N
365                  JN = JJ + J - 1
366                  D( JN ) = WORK( J )
367                  E( JN ) = WORK( N+J )
368                  TAU( JN ) = WORK( 2*N+J )
369   20          CONTINUE
370               D( JJ ) = WORK( 1 )
371            END IF
372         END IF
373*
374      ELSE
375*
376*        Process (IAROW, IACOL) owns block to be factorized
377*
378         IF( MYCOL.EQ.IACOL ) THEN
379            IF( MYROW.EQ.IAROW ) THEN
380*
381*              Reduce the lower triangle of sub( A )
382*
383               DO 30 J = 1, N - 1
384                  IK = II + J - 1
385                  JK = JJ + J - 1
386*
387*                 Generate elementary reflector H(i) = I - tau * v * v'
388*                 to annihilate A(IA+J-JA+2:IA+N-1,JA+J-1)
389*
390                  CALL DLARFG( N-J, A( IK+1+(JK-1)*LDA ),
391     $                         A( IK+2+(JK-1)*LDA ), 1, TAUI )
392                  E( JK ) = A( IK+1+(JK-1)*LDA )
393*
394                  IF( TAUI.NE.ZERO ) THEN
395*
396*                    Apply H(i) from both sides to
397*                    A(IA+J-JA+1:IA+N-1,JA+J+1:JA+N-1)
398*
399                     A( IK+1+(JK-1)*LDA ) = ONE
400*
401*                    Compute  x := tau * A * v  storing y in TAU(i:n-1)
402*
403                     CALL DSYMV( UPLO, N-J, TAUI, A( IK+1+JK*LDA ),
404     $                           LDA, A( IK+1+(JK-1)*LDA ), 1,
405     $                           ZERO, TAU( JK ), 1 )
406*
407*                    Compute  w := x - 1/2 * tau * (x'*v) * v
408*
409                     ALPHA = -HALF*TAUI*DDOT( N-J, TAU( JK ), 1,
410     $                        A( IK+1+(JK-1)*LDA ), 1 )
411                     CALL DAXPY( N-J, ALPHA, A( IK+1+(JK-1)*LDA ),
412     $                           1, TAU( JK ), 1 )
413*
414*                    Apply the transformation as a rank-2 update:
415*                       A := A - v * w' - w * v'
416*
417                     CALL DSYR2( UPLO, N-J, -ONE,
418     $                           A( IK+1+(JK-1)*LDA ), 1,
419     $                           TAU( JK ), 1, A( IK+1+JK*LDA ),
420     $                           LDA )
421                     A( IK+1+(JK-1)*LDA ) = E( JK )
422                  END IF
423*
424*                 Copy D(JK), E(JK), TAU(JK) to broadcast them
425*                 columnwise.
426*
427                  D( JK ) = A( IK+(JK-1)*LDA )
428                  WORK( J ) = D( JK )
429                  WORK( N+J ) = E( JK )
430                  TAU( JK ) = TAUI
431                  WORK( 2*N+J ) = TAU( JK )
432   30          CONTINUE
433               JN = JJ + N - 1
434               D( JN ) = A( II+N-1+(JN-1)*LDA )
435               WORK( N ) = D( JN )
436               TAU( JN ) = ZERO
437               WORK( 2*N ) = ZERO
438*
439               CALL DGEBS2D( ICTXT, 'Columnwise', ' ', 1, 3*N-1, WORK,
440     $                            1 )
441*
442            ELSE
443               CALL DGEBR2D( ICTXT, 'Columnwise', ' ', 1, 3*N-1, WORK,
444     $                       1, IAROW, IACOL )
445               DO 40 J = 1, N - 1
446                  JN = JJ + J - 1
447                  D( JN ) = WORK( J )
448                  E( JN ) = WORK( N+J )
449                  TAU( JN ) = WORK( 2*N+J )
450   40          CONTINUE
451               JN = JJ + N - 1
452               D( JN ) = WORK( N )
453               TAU( JN ) = ZERO
454            END IF
455         END IF
456      END IF
457*
458      WORK( 1 ) = DBLE( LWMIN )
459*
460      RETURN
461*
462*     End of PDSYTD2
463*
464      END
465