1      SUBROUTINE PDSYTTRD( UPLO, N, A, IA, JA, DESCA, D, E, TAU, WORK,
2     $                     LWORK, INFO )
3*
4*  -- ScaLAPACK routine (version 2.0.2) --
5*     Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
6*     May 1 2012
7*
8*     .. Scalar Arguments ..
9      CHARACTER          UPLO
10      INTEGER            IA, INFO, JA, LWORK, N
11*     ..
12*     .. Array Arguments ..
13      INTEGER            DESCA( * )
14      DOUBLE PRECISION   A( * ), D( * ), E( * ), TAU( * ), WORK( * )
15*     ..
16*
17*     Purpose
18*
19*     =======
20*
21*     PDSYTTRD reduces a complex Hermitian matrix sub( A ) to Hermitian
22*     tridiagonal form T by an unitary similarity transformation:
23*     Q' * sub( A ) * Q = T, where sub( A ) = A(IA:IA+N-1,JA:JA+N-1).
24*
25*     Notes
26*     =====
27*
28*     Each global data object is described by an associated description
29*     vector.  This vector stores the information required to establish
30*     the mapping between an object element and its corresponding
31*     process and memory location.
32*
33*     Let A be a generic term for any 2D block cyclicly distributed
34*     array.
35*     Such a global array has an associated description vector DESCA.
36*     In the following comments, the character _ should be read as
37*     "of the global array".
38*
39*     NOTATION        STORED IN      EXPLANATION
40*     --------------- -------------- -----------------------------------
41*     DTYPE_A(global) DESCA( DTYPE_ )The descriptor type.  In this case,
42*     DTYPE_A = 1.
43*     CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle,
44*     indicating the BLACS process grid A is distribu-
45*     ted over. The context itself is glo-
46*     bal, but the handle (the integer
47*     value) may vary.
48*     M_A    (global) DESCA( M_ )    The number of rows in the global
49*     array A.
50*     N_A    (global) DESCA( N_ )    The number of columns in the global
51*     array A.
52*     MB_A   (global) DESCA( MB_ )   The blocking factor used to
53*     distribute the rows of the array.
54*     NB_A   (global) DESCA( NB_ )   The blocking factor used to
55*     distribute the columns of the array.
56*     RSRC_A (global) DESCA( RSRC_ ) The process row over which the
57*     first row of the array A is distributed.
58*     CSRC_A (global) DESCA( CSRC_ ) The process column over which the
59*     first column of the array A is
60*     distributed.
61*     LLD_A  (local)  DESCA( LLD_ )  The leading dimension of the local
62*     array.  LLD_A >= MAX(1,LOCp(M_A)).
63*
64*     Let K be the number of rows or columns of a distributed matrix,
65*     and assume that its process grid has dimension p x q.
66*     LOCp( K ) denotes the number of elements of K that a process
67*     would receive if K were distributed over the p processes of its
68*     process column.
69*     Similarly, LOCq( K ) denotes the number of elements of K that a
70*     process would receive if K were distributed over the q processes
71*     of its process row.
72*     The values of LOCp() and LOCq() may be determined via a call to
73*     the ScaLAPACK tool function, NUMROC:
74*     LOCp( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
75*     LOCq( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
76*     An upper bound for these quantities may be computed by:
77*     LOCp( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
78*     LOCq( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
79*
80*     Arguments
81*     =========
82*
83*     UPLO    (global input) CHARACTER
84*     Specifies whether the upper or lower triangular part of the
85*     Hermitian matrix sub( A ) is stored:
86*     = 'U':  Upper triangular
87*     = 'L':  Lower triangular
88*
89*     N       (global input) INTEGER
90*     The number of rows and columns to be operated on, i.e. the
91*     order of the distributed submatrix sub( A ). N >= 0.
92*
93*     A  (local input/local output) DOUBLE PRECISION pointer into the
94*     local memory to an array of dimension (LLD_A,LOCq(JA+N-1)).
95*     On entry, this array contains the local pieces of the
96*     Hermitian distributed matrix sub( A ).  If UPLO = 'U', the
97*     leading N-by-N upper triangular part of sub( A ) contains
98*     the upper triangular part of the matrix, and its strictly
99*     lower triangular part is not referenced. If UPLO = 'L', the
100*     leading N-by-N lower triangular part of sub( A ) contains the
101*     lower triangular part of the matrix, and its strictly upper
102*     triangular part is not referenced. On exit, if UPLO = 'U',
103*     the diagonal and first superdiagonal of sub( A ) are over-
104*     written by the corresponding elements of the tridiagonal
105*     matrix T, and the elements above the first superdiagonal,
106*     with the array TAU, represent the unitary matrix Q as a
107*     product of elementary reflectors; if UPLO = 'L', the diagonal
108*     and first subdiagonal of sub( A ) are overwritten by the
109*     corresponding elements of the tridiagonal matrix T, and the
110*     elements below the first subdiagonal, with the array TAU,
111*     represent the unitary matrix Q as a product of elementary
112*     reflectors. See Further Details.
113*
114*     IA      (global input) INTEGER
115*     The row index in the global array A indicating the first
116*     row of sub( A ).
117*
118*     JA      (global input) INTEGER
119*     The column index in the global array A indicating the
120*     first column of sub( A ).
121*
122*     DESCA   (global and local input) INTEGER array of dimension DLEN_.
123*     The array descriptor for the distributed matrix A.
124*
125*     D       (local output) DOUBLE PRECISION array, dim LOCq(JA+N-1)
126*     The diagonal elements of the tridiagonal matrix T:
127*     D(i) = A(i,i). D is tied to the distributed matrix A.
128*
129*     E       (local output) DOUBLE PRECISION array, dim LOCq(JA+N-1)
130*     if UPLO = 'U', LOCq(JA+N-2) otherwise. The off-diagonal
131*     elements of the tridiagonal matrix T: E(i) = A(i,i+1) if
132*     UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. E is tied to the
133*     distributed matrix A.
134*
135*     TAU     (local output) DOUBLE PRECISION array, dimension
136*     LOCq(JA+N-1). This array contains the scalar factors TAU of
137*     the elementary reflectors. TAU is tied to the distributed
138*     matrix A.
139*
140*     WORK  (local workspace) DOUBLE PRECISION array, dimension (LWORK)
141*     On exit, WORK( 1 ) returns the minimal and optimal workspace
142*
143*     LWORK   (local input) INTEGER
144*     The dimension of the array WORK.
145*     LWORK >= 2*( ANB+1 )*( 4*NPS+2 ) + NPS
146*     Where:
147*         NPS = MAX( NUMROC( N, 1, 0, 0, NPROW ), 2*ANB )
148*         ANB = PJLAENV( DESCA( CTXT_ ), 3, 'PDSYTTRD', 'L', 0, 0,
149*           0, 0 )
150*
151*         NUMROC is a ScaLAPACK tool function;
152*         PJLAENV is a ScaLAPACK envionmental inquiry function
153*         MYROW, MYCOL, NPROW and NPCOL can be determined by calling
154*         the subroutine BLACS_GRIDINFO.
155*
156*     INFO    (global output) INTEGER
157*     = 0:  successful exit
158*     < 0:  If the i-th argument is an array and the j-entry had
159*     an illegal value, then INFO = -(i*100+j), if the i-th
160*     argument is a scalar and had an illegal value, then
161*     INFO = -i.
162*
163*     Further Details
164*     ===============
165*
166*     If UPLO = 'U', the matrix Q is represented as a product of
167*     elementary reflectors
168*
169*     Q = H(n-1) . . . H(2) H(1).
170*
171*     Each H(i) has the form
172*
173*     H(i) = I - tau * v * v'
174*
175*     where tau is a complex scalar, and v is a complex vector with
176*     v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in
177*     A(ia:ia+i-2,ja+i), and tau in TAU(ja+i-1).
178*
179*     If UPLO = 'L', the matrix Q is represented as a product of
180*     elementary reflectors
181*
182*     Q = H(1) H(2) . . . H(n-1).
183*
184*     Each H(i) has the form
185*
186*     H(i) = I - tau * v * v'
187*
188*     where tau is a complex scalar, and v is a complex vector with
189*     v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in
190*     A(ia+i+1:ia+n-1,ja+i-1), and tau in TAU(ja+i-1).
191*
192*     The contents of sub( A ) on exit are illustrated by the following
193*     examples with n = 5:
194*
195*     if UPLO = 'U':                       if UPLO = 'L':
196*
197*     (  d   e   v2  v3  v4 )              (  d                  )
198*     (      d   e   v3  v4 )              (  e   d              )
199*     (          d   e   v4 )              (  v1  e   d          )
200*     (              d   e  )              (  v1  v2  e   d      )
201*     (                  d  )              (  v1  v2  v3  e   d  )
202*
203*     where d and e denote diagonal and off-diagonal elements of T, and
204*     vi denotes an element of the vector defining H(i).
205*
206*     Data storage requirements
207*     =========================
208*
209*     PDSYTTRD is not intended to be called directly.  All users are
210*     encourage to call PDSYTRD which will then call PDHETTRD if
211*     appropriate.  A must be in cyclic format (i.e. MB = NB = 1),
212*     the process grid must be square ( i.e. NPROW = NPCOL ) and
213*     only lower triangular storage is supported.
214*
215*     Local variables
216*     ===============
217*
218*     PDSYTTRD uses five local arrays:
219*       WORK ( InV ) dimension ( NP, ANB+1): array V
220*       WORK ( InH ) dimension ( NP, ANB+1): array H
221*       WORK ( InVT ) dimension ( NQ, ANB+1): transpose of the array V
222*       WORK ( InHT ) dimension ( NQ, ANB+1): transpose of the array H
223*       WORK ( InVTT ) dimension ( NQ, 1): transpose of the array VT
224*
225*     Arrays V and H are replicated across all processor columns.
226*     Arrays V^T and H^T are replicated across all processor rows.
227*
228*         WORK ( InVT ), or V^T, is stored as a tall skinny
229*         array ( NQ x ANB-1 ) for efficiency.  Since only the lower
230*         triangular portion of A is updated, Av is computed as:
231*         tril(A) * v + v^T * tril(A,-1).  This is performed as
232*         two local triangular matrix-vector multiplications (both in
233*         MVR2) followed by a transpose and a sum across the columns.
234*         In the local computation, WORK( InVT ) is used to compute
235*         tril(A) * v and WORK( InV ) is used to compute
236*         v^T * tril(A,-1)
237*
238*     The following variables are global indices into A:
239*       INDEX:  The current global row and column number.
240*       MAXINDEX:  The global row and column for the first row and
241*       column in the trailing block of A.
242*       LIIB, LIJB:  The first row, column in
243*
244*     The following variables point into the arrays A, V, H, V^T, H^T:
245*       BINDEX  =INDEX-MININDEX: The column index in V, H, V^T, H^T.
246*       LII:  local index I:  The local row number for row INDEX
247*       LIJ:  local index J:  The local column number for column INDEX
248*       LIIP1:  local index I+1:  The local row number for row INDEX+1
249*       LIJP1:  local index J+1:  The local col number for col INDEX+1
250*       LTLI: lower triangular local index I:  The local row for the
251*         upper left entry in tril( A(INDEX, INDEX) )
252*       LTLIP1: lower triangular local index I+1:  The local row for the
253*         upper left entry in tril( A(INDEX+1, INDEX+1) )
254*
255*         Details:  The distinction between LII and LTLI (and between
256*         LIIP1 and LTLIP1) is subtle.  Within the current processor
257*         column (i.e. MYCOL .eq. CURCOL) they are the same.  However,
258*         on some processors, A( LII, LIJ ) points to an element
259*         above the diagonal, on these processors, LTLI = LII+1.
260*
261*     The following variables give the number of rows and/or columns
262*     in various matrices:
263*       NP:  The number of local rows in A( 1:N, 1:N )
264*       NQ:  The number of local columns in A( 1:N, 1:N )
265*       NPM0:  The number of local rows in A( INDEX:N, INDEX:N )
266*       NQM0:  The number of local columns in A( INDEX:N, INDEX:N )
267*       NPM1:  The number of local rows in A( INDEX+1:N, INDEX:N )
268*       NQM1:  The number of local columns in A( INDEX+1:N, INDEX:N )
269*       LTNM0:  The number of local rows & columns in
270*         tril( A( INDEX:N, INDEX:N ) )
271*       LTNM1:  The number of local rows & columns in
272*         tril( A( INDEX+1:N, INDEX+1:N ) )
273*         NOTE:  LTNM0 == LTNM1 on all processors except the diagonal
274*         processors, i.e. those where MYCOL == MYROW.
275*
276*         Invariants:
277*           NP = NPM0 + LII - 1
278*           NQ = NQM0 + LIJ - 1
279*           NP = NPM1 + LIIP1 - 1
280*           NQ = NQM1 + LIJP1 - 1
281*           NP = LTLI + LTNM0 - 1
282*           NP = LTLIP1 + LTNM1 - 1
283*
284*       Temporary variables.  The following variables are used within
285*       a few lines after they are set and do hold state from one loop
286*       iteration to the next:
287*
288*     The matrix A:
289*       The matrix A does not hold the same values that it would
290*       in an unblocked code nor the values that it would hold in
291*       in a blocked code.
292*
293*       The value of A is confusing.  It is easiest to state the
294*       difference between trueA and A at the point that MVR2 is called,
295*       so we will start there.
296*
297*       Let trueA be the value that A would
298*       have at a given point in an unblocked code and A
299*       be the value that A has in this code at the same point.
300*
301*       At the time of the call to MVR2,
302*       trueA = A + V' * H + H' * V
303*       where H = H( MAXINDEX:N, 1:BINDEX ) and
304*       V = V( MAXINDEX:N, 1:BINDEX ).
305*
306*       At the bottom of the inner loop,
307*       trueA = A +  V' * H + H' * V + v' * h + h' * v
308*       where H = H( MAXINDEX:N, 1:BINDEX ) and
309*       V = V( MAXINDEX:N, 1:BINDEX ) and
310*       v = V( liip1:N, BINDEX+1 ) and
311*       h = H( liip1:N, BINDEX+1 )
312*
313*       At the top of the loop, BINDEX gets incremented, hence:
314*       trueA = A +  V' * H + H' * V + v' * h + h' * v
315*       where H = H( MAXINDEX:N, 1:BINDEX-1 ) and
316*       V = V( MAXINDEX:N, 1:BINDEX-1 ) and
317*       v = V( liip1:N, BINDEX ) and
318*       h = H( liip1:N, BINDEX )
319*
320*
321*       A gets updated at the bottom of the outer loop
322*       After this update, trueA = A + v' * h + h' * v
323*       where v = V( liip1:N, BINDEX ) and
324*       h = H( liip1:N, BINDEX ) and BINDEX = 0
325*       Indeed, the previous loop invariant as stated above for the
326*       top of the loop still holds, but with BINDEX = 0, H and V
327*       are null matrices.
328*
329*       After the current column of A is updated,
330*         trueA( INDEX, INDEX:N ) = A( INDEX, INDEX:N )
331*       the rest of A is untouched.
332*
333*       After the current block column of A is updated,
334*       trueA = A + V' * H + H' * V
335*       where H = H( MAXINDEX:N, 1:BINDEX ) and
336*       V = V( MAXINDEX:N, 1:BINDEX )
337*
338*       This brings us back to the point at which mvr2 is called.
339*
340*
341*     Details of the parallelization:
342*
343*       We delay spreading v across to all processor columns (which
344*       would naturally happen at the bottom of the loop) in order to
345*       combine the spread of v( : , i-1 ) with the spread of h( : , i )
346*
347*       In order to compute h( :, i ), we must update A( :, i )
348*       which means that the processor column owning A( :, i ) must
349*       have: c, tau, v( i, i ) and h( i, i ).
350*
351*       The traditional
352*       way of computing v (and the one used in pzlatrd.f and
353*       zlatrd.f) is:
354*         v = tau * v
355*         c = v' * h
356*         alpha = - tau * c / 2
357*         v = v + alpha * h
358*       However, the traditional way of computing v requires that tau
359*       be broadcast to all processors in the current column (to compute
360*       v = tau * v) and then a sum-to-all is required (to
361*       compute v' * h ).  We use the following formula instead:
362*         c = v' * h
363*         v = tau * ( v - c * tau' * h / 2 )
364*       The above formula allows tau to be spread down in the
365*       same call to DGSUM2D which performs the sum-to-all of c.
366*
367*       The computation of v, which could be performed in any processor
368*       column (or other procesor subsets), is performed in the
369*       processor column that owns A( :, i+1 ) so that A( :, i+1 )
370*       can be updated prior to spreading v across.
371*
372*       We keep the block column of A up-to-date to minimize the
373*       work required in updating the current column of A.  Updating
374*       the block column of A is reasonably load balanced whereas
375*       updating the current column of A is not (only the current
376*       processor column is involved).
377*
378*     In the following overview of the steps performed, M in the
379*     margin indicates message traffic and C indicates O(n^2 nb/sqrt(p))
380*     or more flops per processor.
381*
382*     Inner loop:
383*       A( index:n, index ) -= ( v * ht(bindex) + h * vt( bindex) )
384*M      h = house( A(index:n, index) )
385*M      Spread v, h across
386*M      vt = v^T; ht = h^T
387*       A( index+1:n, index+1:maxindex ) -=
388*         ( v * ht(index+1:maxindex) + h *vt(index+1:maxindex) )
389*C      v = tril(A) * h; vt = ht * tril(A,-1)
390*MorC   v = v - H*V*h - V*H*h
391*M      v = v + vt^T
392*M      c = v' * h
393*       v = tau * ( v - c * tau' * h / 2 )
394*C    A = A - H*V - V*H
395*
396*
397*
398*     =================================================================
399*
400*     .. Parameters ..
401      INTEGER            BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_,
402     $                   MB_, NB_, RSRC_, CSRC_, LLD_
403      PARAMETER          ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1,
404     $                   CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6,
405     $                   RSRC_ = 7, CSRC_ = 8, LLD_ = 9 )
406      DOUBLE PRECISION   ONE
407      PARAMETER          ( ONE = 1.0D0 )
408      DOUBLE PRECISION   Z_ONE, Z_NEGONE, Z_ZERO
409      PARAMETER          ( Z_ONE = 1.0D0, Z_NEGONE = -1.0D0,
410     $                   Z_ZERO = 0.0D0 )
411      DOUBLE PRECISION   ZERO
412      PARAMETER          ( ZERO = 0.0D+0 )
413*     ..
414*
415*
416*     .. Local Scalars ..
417*
418*
419      LOGICAL            BALANCED, INTERLEAVE, TWOGEMMS, UPPER
420      INTEGER            ANB, BINDEX, CURCOL, CURROW, I, ICTXT, INDEX,
421     $                   INDEXA, INDEXINH, INDEXINV, INH, INHB, INHT,
422     $                   INHTB, INTMP, INV, INVB, INVT, INVTB, J, LDA,
423     $                   LDV, LDZG, LII, LIIB, LIIP1, LIJ, LIJB, LIJP1,
424     $                   LTLIP1, LTNM1, LWMIN, MAXINDEX, MININDEX,
425     $                   MYCOL, MYFIRSTROW, MYROW, MYSETNUM, NBZG, NP,
426     $                   NPB, NPCOL, NPM0, NPM1, NPROW, NPS, NPSET, NQ,
427     $                   NQB, NQM1, NUMROWS, NXTCOL, NXTROW, PBMAX,
428     $                   PBMIN, PBSIZE, PNB, ROWSPERPROC
429      DOUBLE PRECISION   ALPHA, BETA, C, CONJTOPH, CONJTOPV, NORM,
430     $                   ONEOVERBETA, SAFMAX, SAFMIN, TOPH, TOPNV,
431     $                   TOPTAU, TOPV
432*     ..
433*     .. Local Arrays ..
434*
435*
436*
437*
438      INTEGER            IDUM1( 1 ), IDUM2( 1 )
439      DOUBLE PRECISION   CC( 3 ), DTMP( 5 )
440*     ..
441*     .. External Subroutines ..
442      EXTERNAL           BLACS_GRIDINFO, CHK1MAT, DCOMBNRM2, DGEBR2D,
443     $                   DGEBS2D, DGEMM, DGEMV, DGERV2D, DGESD2D,
444     $                   DGSUM2D, DLAMOV, DSCAL, DTRMVT, PCHK1MAT,
445     $                   PDTREECOMB, PXERBLA
446*     ..
447*     .. External Functions ..
448*
449      LOGICAL            LSAME
450      INTEGER            ICEIL, NUMROC, PJLAENV
451      DOUBLE PRECISION   DNRM2, PDLAMCH
452      EXTERNAL           LSAME, ICEIL, NUMROC, PJLAENV, DNRM2, PDLAMCH
453*     ..
454*     .. Intrinsic Functions ..
455      INTRINSIC          DBLE, ICHAR, MAX, MIN, MOD, SIGN, SQRT
456*     ..
457*
458*
459*     .. Executable Statements ..
460*       This is just to keep ftnchek and toolpack/1 happy
461      IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_*
462     $    RSRC_.LT.0 )RETURN
463*
464*
465*
466*     Further details
467*     ===============
468*
469*     At the top of the loop, v and nh have been computed but not
470*     spread across.  Hence, A is out-of-date even after the
471*     rank 2k update.  Furthermore, we compute the next v before
472*     nh is spread across.
473*
474*     I claim that if we used a sum-to-all on NV, by summing CC within
475*     each column, that we could compute NV locally and could avoid
476*     spreading V across.  Bruce claims that sum-to-all can be made
477*     to cost no more than sum-to-one on the Paragon.  If that is
478*     true, this would be a win.  But,
479*     the BLACS sum-to-all is just a sum-to-one followed by a broadcast,
480*     and hence the present scheme is better for now.
481*
482*     Get grid parameters
483*
484      ICTXT = DESCA( CTXT_ )
485      CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
486*
487      SAFMAX = SQRT( PDLAMCH( ICTXT, 'O' ) ) / N
488      SAFMIN = SQRT( PDLAMCH( ICTXT, 'S' ) )
489*
490*     Test the input parameters
491*
492      INFO = 0
493      IF( NPROW.EQ.-1 ) THEN
494         INFO = -( 600+CTXT_ )
495      ELSE
496*
497*     Here we set execution options for PDSYTTRD
498*
499         PNB = PJLAENV( ICTXT, 2, 'PDSYTTRD', 'L', 0, 0, 0, 0 )
500         ANB = PJLAENV( ICTXT, 3, 'PDSYTTRD', 'L', 0, 0, 0, 0 )
501*
502         INTERLEAVE = ( PJLAENV( ICTXT, 4, 'PDSYTTRD', 'L', 1, 0, 0,
503     $                0 ).EQ.1 )
504         TWOGEMMS = ( PJLAENV( ICTXT, 4, 'PDSYTTRD', 'L', 2, 0, 0,
505     $              0 ).EQ.1 )
506         BALANCED = ( PJLAENV( ICTXT, 4, 'PDSYTTRD', 'L', 3, 0, 0,
507     $              0 ).EQ.1 )
508*
509         CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO )
510*
511*
512         UPPER = LSAME( UPLO, 'U' )
513         IF( INFO.EQ.0 .AND. DESCA( NB_ ).NE.1 )
514     $      INFO = 600 + NB_
515         IF( INFO.EQ.0 ) THEN
516*
517*
518*           Here is the arithmetic:
519*             Let maxnpq = max( np, nq, 2 * ANB )
520*             LDV = 4 * max( np, nq ) + 2
521*             LWMIN = 2 * ( ANB + 1 ) * LDV + MAX( np, 2 * ANB )
522*             = 2 * ( ANB + 1 ) * ( 4 * NPS + 2 ) + NPS
523*
524*           This overestimates memory requirements when ANB > NP/2
525*           Memory requirements are lower when interleave = .false.
526*           Hence, we could have two sets of memory requirements,
527*           one for interleave and one for
528*
529*
530            NPS = MAX( NUMROC( N, 1, 0, 0, NPROW ), 2*ANB )
531            LWMIN = 2*( ANB+1 )*( 4*NPS+2 ) + NPS
532*
533            WORK( 1 ) = DBLE( LWMIN )
534            IF( .NOT.LSAME( UPLO, 'L' ) ) THEN
535               INFO = -1
536            ELSE IF( IA.NE.1 ) THEN
537               INFO = -4
538            ELSE IF( JA.NE.1 ) THEN
539               INFO = -5
540            ELSE IF( NPROW.NE.NPCOL ) THEN
541               INFO = -( 600+CTXT_ )
542            ELSE IF( DESCA( DTYPE_ ).NE.1 ) THEN
543               INFO = -( 600+DTYPE_ )
544            ELSE IF( DESCA( MB_ ).NE.1 ) THEN
545               INFO = -( 600+MB_ )
546            ELSE IF( DESCA( NB_ ).NE.1 ) THEN
547               INFO = -( 600+NB_ )
548            ELSE IF( DESCA( RSRC_ ).NE.0 ) THEN
549               INFO = -( 600+RSRC_ )
550            ELSE IF( DESCA( CSRC_ ).NE.0 ) THEN
551               INFO = -( 600+CSRC_ )
552            ELSE IF( LWORK.LT.LWMIN ) THEN
553               INFO = -11
554            END IF
555         END IF
556         IF( UPPER ) THEN
557            IDUM1( 1 ) = ICHAR( 'U' )
558         ELSE
559            IDUM1( 1 ) = ICHAR( 'L' )
560         END IF
561         IDUM2( 1 ) = 1
562*
563         CALL PCHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2,
564     $                  INFO )
565      END IF
566*
567      IF( INFO.NE.0 ) THEN
568         CALL PXERBLA( ICTXT, 'PDSYTTRD', -INFO )
569         RETURN
570      END IF
571*
572*     Quick return if possible
573*
574      IF( N.EQ.0 )
575     $   RETURN
576*
577*
578*
579*     Reduce the lower triangle of sub( A )
580      NP = NUMROC( N, 1, MYROW, 0, NPROW )
581      NQ = NUMROC( N, 1, MYCOL, 0, NPCOL )
582*
583      NXTROW = 0
584      NXTCOL = 0
585*
586      LIIP1 = 1
587      LIJP1 = 1
588      NPM1 = NP
589      NQM1 = NQ
590*
591      LDA = DESCA( LLD_ )
592      ICTXT = DESCA( CTXT_ )
593*
594*
595*
596*     Miscellaneous details:
597*     Put tau, D and E in the right places
598*     Check signs
599*     Place all the arrays in WORK, control their placement
600*     in  memory.
601*
602*
603*
604*     Loop invariants
605*     A(LIIP1, LIJ) points to the first element of A(I+1,J)
606*     NPM1,NQM1 = the number of rows, cols in A( LII+1:N,LIJ+1:N )
607*     A(LII:N,LIJ:N) is one step out of date.
608*     proc( CURROW, CURCOL ) owns A(LII,LIJ)
609*     proc( NXTROW, CURCOL ) owns A(LIIP1,LIJ)
610*
611      INH = 1
612*
613      IF( INTERLEAVE ) THEN
614*
615*        H and V are interleaved to minimize memory movement
616*        LDV has to be twice as large to accomodate interleaving.
617*        In addition, LDV is doubled again to allow v, h and
618*        toptau to be spreaad across and transposed in a
619*        single communication operation with minimum memory
620*        movement.
621*
622*        We could reduce LDV back to 2*MAX(NPM1,NQM1)
623*        by increasing the memory movement required in
624*        the spread and transpose of v, h and toptau.
625*        However, since the non-interleaved path already
626*        provides a mear minimum memory requirement option,
627*        we did not provide this additional path.
628*
629         LDV = 4*( MAX( NPM1, NQM1 ) ) + 2
630*
631         INH = 1
632*
633         INV = INH + LDV / 2
634         INVT = INH + ( ANB+1 )*LDV
635*
636         INHT = INVT + LDV / 2
637         INTMP = INVT + LDV*( ANB+1 )
638*
639      ELSE
640         LDV = MAX( NPM1, NQM1 )
641*
642         INHT = INH + LDV*( ANB+1 )
643         INV = INHT + LDV*( ANB+1 )
644*
645*        The code works without this +1, but only because of a
646*        coincidence.  Without the +1, WORK(INVT) gets trashed, but
647*        WORK(INVT) is only used once and when it is used, it is
648*        multiplied by WORK( INH ) which is zero.  Hence, the fact
649*        that WORK(INVT) is trashed has no effect.
650*
651         INVT = INV + LDV*( ANB+1 ) + 1
652         INTMP = INVT + LDV*( 2*ANB )
653*
654      END IF
655*
656      IF( INFO.NE.0 ) THEN
657         CALL PXERBLA( ICTXT, 'PDSYTTRD', -INFO )
658         WORK( 1 ) = DBLE( LWMIN )
659         RETURN
660      END IF
661*
662*
663*        The satisfies the loop invariant: trueA = A - V * HT - H * VT,
664*        (where V, H, VT and HT all have BINDEX+1 rows/columns)
665*        the first ANB times through the loop.
666*
667*
668*
669*     Setting either ( InH and InHT ) or InV to Z_ZERO
670*     is adequate except in the face of NaNs.
671*
672*
673      DO 10 I = 1, NP
674         WORK( INH+I-1 ) = Z_ZERO
675         WORK( INV+I-1 ) = Z_ZERO
676   10 CONTINUE
677      DO 20 I = 1, NQ
678         WORK( INHT+I-1 ) = Z_ZERO
679   20 CONTINUE
680*
681*
682*
683      TOPNV = Z_ZERO
684*
685      LTLIP1 = LIJP1
686      LTNM1 = NPM1
687      IF( MYCOL.GT.MYROW ) THEN
688         LTLIP1 = LTLIP1 + 1
689         LTNM1 = LTNM1 - 1
690      END IF
691*
692*
693      DO 210 MININDEX = 1, N - 1, ANB
694*
695*
696         MAXINDEX = MIN( MININDEX+ANB-1, N )
697         LIJB = NUMROC( MAXINDEX, 1, MYCOL, 0, NPCOL ) + 1
698         LIIB = NUMROC( MAXINDEX, 1, MYROW, 0, NPROW ) + 1
699*
700         NQB = NQ - LIJB + 1
701         NPB = NP - LIIB + 1
702         INHTB = INHT + LIJB - 1
703         INVTB = INVT + LIJB - 1
704         INHB = INH + LIIB - 1
705         INVB = INV + LIIB - 1
706*
707*
708*
709*
710         DO 160 INDEX = MININDEX, MIN( MAXINDEX, N-1 )
711*
712            BINDEX = INDEX - MININDEX
713*
714            CURROW = NXTROW
715            CURCOL = NXTCOL
716*
717            NXTROW = MOD( CURROW+1, NPROW )
718            NXTCOL = MOD( CURCOL+1, NPCOL )
719*
720            LII = LIIP1
721            LIJ = LIJP1
722            NPM0 = NPM1
723*
724            IF( MYROW.EQ.CURROW ) THEN
725               NPM1 = NPM1 - 1
726               LIIP1 = LIIP1 + 1
727            END IF
728            IF( MYCOL.EQ.CURCOL ) THEN
729               NQM1 = NQM1 - 1
730               LIJP1 = LIJP1 + 1
731               LTLIP1 = LTLIP1 + 1
732               LTNM1 = LTNM1 - 1
733            END IF
734*
735*
736*
737*
738*     V = NV, VT = NVT, H = NH, HT = NHT
739*
740*
741*     Update the current column of A
742*
743*
744            IF( MYCOL.EQ.CURCOL ) THEN
745*
746               INDEXA = LII + ( LIJ-1 )*LDA
747               INDEXINV = INV + LII - 1 + ( BINDEX-1 )*LDV
748               INDEXINH = INH + LII - 1 + ( BINDEX-1 )*LDV
749               CONJTOPH = WORK( INHT+LIJ-1+BINDEX*LDV )
750               CONJTOPV = TOPNV
751*
752               IF( INDEX.GT.1 ) THEN
753                  DO 30 I = 0, NPM0 - 1
754*                  A( INDEXA+I ) = A( INDEXA+I )
755                     A( INDEXA+I ) = A( INDEXA+I ) -
756     $                               WORK( INDEXINV+LDV+I )*CONJTOPH -
757     $                               WORK( INDEXINH+LDV+I )*CONJTOPV
758   30             CONTINUE
759               END IF
760*
761*
762            END IF
763*
764*
765            IF( MYCOL.EQ.CURCOL ) THEN
766*
767*     Compute the householder vector
768*
769               IF( MYROW.EQ.CURROW ) THEN
770                  DTMP( 2 ) = A( LII+( LIJ-1 )*LDA )
771               ELSE
772                  DTMP( 2 ) = ZERO
773               END IF
774               IF( MYROW.EQ.NXTROW ) THEN
775                  DTMP( 3 ) = A( LIIP1+( LIJ-1 )*LDA )
776                  DTMP( 4 ) = ZERO
777               ELSE
778                  DTMP( 3 ) = ZERO
779                  DTMP( 4 ) = ZERO
780               END IF
781*
782               NORM = DNRM2( NPM1, A( LIIP1+( LIJ-1 )*LDA ), 1 )
783               DTMP( 1 ) = NORM
784*
785*              IF DTMP(5) = 1.0, NORM is too large and might cause
786*              overflow, hence PDTREECOMB must be called.  IF DTMP(5)
787*              is zero on output, DTMP(1) can be trusted.
788*
789               DTMP( 5 ) = ZERO
790               IF( DTMP( 1 ).GE.SAFMAX .OR. DTMP( 1 ).LT.SAFMIN ) THEN
791                  DTMP( 5 ) = ONE
792                  DTMP( 1 ) = ZERO
793               END IF
794*
795               DTMP( 1 ) = DTMP( 1 )*DTMP( 1 )
796               CALL DGSUM2D( ICTXT, 'C', ' ', 5, 1, DTMP, 5, -1,
797     $                       CURCOL )
798               IF( DTMP( 5 ).EQ.ZERO ) THEN
799                  DTMP( 1 ) = SQRT( DTMP( 1 ) )
800               ELSE
801                  DTMP( 1 ) = NORM
802                  CALL PDTREECOMB( ICTXT, 'C', 1, DTMP, -1, MYCOL,
803     $                             DCOMBNRM2 )
804               END IF
805*
806               NORM = DTMP( 1 )
807*
808               D( LIJ ) = DTMP( 2 )
809               IF( MYROW.EQ.CURROW .AND. MYCOL.EQ.CURCOL ) THEN
810                  A( LII+( LIJ-1 )*LDA ) = D( LIJ )
811               END IF
812*
813*
814               ALPHA = DTMP( 3 )
815*
816               NORM = SIGN( NORM, ALPHA )
817*
818               IF( NORM.EQ.ZERO ) THEN
819                  TOPTAU = ZERO
820               ELSE
821                  BETA = NORM + ALPHA
822                  TOPTAU = BETA / NORM
823                  ONEOVERBETA = 1.0D0 / BETA
824*
825                  CALL DSCAL( NPM1, ONEOVERBETA,
826     $                        A( LIIP1+( LIJ-1 )*LDA ), 1 )
827               END IF
828*
829               IF( MYROW.EQ.NXTROW ) THEN
830                  A( LIIP1+( LIJ-1 )*LDA ) = Z_ONE
831               END IF
832*
833               TAU( LIJ ) = TOPTAU
834               E( LIJ ) = -NORM
835*
836            END IF
837*
838*
839*     Spread v, nh, toptau across
840*
841            DO 40 I = 0, NPM1 - 1
842               WORK( INV+LIIP1-1+BINDEX*LDV+NPM1+I ) = A( LIIP1+I+
843     $            ( LIJ-1 )*LDA )
844   40       CONTINUE
845*
846            IF( MYCOL.EQ.CURCOL ) THEN
847               WORK( INV+LIIP1-1+BINDEX*LDV+NPM1+NPM1 ) = TOPTAU
848               CALL DGEBS2D( ICTXT, 'R', ' ', NPM1+NPM1+1, 1,
849     $                       WORK( INV+LIIP1-1+BINDEX*LDV ),
850     $                       NPM1+NPM1+1 )
851            ELSE
852               CALL DGEBR2D( ICTXT, 'R', ' ', NPM1+NPM1+1, 1,
853     $                       WORK( INV+LIIP1-1+BINDEX*LDV ),
854     $                       NPM1+NPM1+1, MYROW, CURCOL )
855               TOPTAU = WORK( INV+LIIP1-1+BINDEX*LDV+NPM1+NPM1 )
856            END IF
857            DO 50 I = 0, NPM1 - 1
858               WORK( INH+LIIP1-1+( BINDEX+1 )*LDV+I ) = WORK( INV+LIIP1-
859     $            1+BINDEX*LDV+NPM1+I )
860   50       CONTINUE
861*
862            IF( INDEX.LT.N ) THEN
863               IF( MYROW.EQ.NXTROW .AND. MYCOL.EQ.CURCOL )
864     $            A( LIIP1+( LIJ-1 )*LDA ) = E( LIJ )
865            END IF
866*
867*     Transpose v, nh
868*
869*
870            IF( MYROW.EQ.MYCOL ) THEN
871               DO 60 I = 0, NPM1 + NPM1
872                  WORK( INVT+LIJP1-1+BINDEX*LDV+I ) = WORK( INV+LIIP1-1+
873     $               BINDEX*LDV+I )
874   60          CONTINUE
875            ELSE
876               CALL DGESD2D( ICTXT, NPM1+NPM1, 1,
877     $                       WORK( INV+LIIP1-1+BINDEX*LDV ), NPM1+NPM1,
878     $                       MYCOL, MYROW )
879               CALL DGERV2D( ICTXT, NQM1+NQM1, 1,
880     $                       WORK( INVT+LIJP1-1+BINDEX*LDV ), NQM1+NQM1,
881     $                       MYCOL, MYROW )
882            END IF
883*
884            DO 70 I = 0, NQM1 - 1
885               WORK( INHT+LIJP1-1+( BINDEX+1 )*LDV+I ) = WORK( INVT+
886     $            LIJP1-1+BINDEX*LDV+NQM1+I )
887   70       CONTINUE
888*
889*
890*           Update the current block column of A
891*
892            IF( INDEX.GT.1 ) THEN
893               DO 90 J = LIJP1, LIJB - 1
894                  DO 80 I = 0, NPM1 - 1
895*
896                     A( LIIP1+I+( J-1 )*LDA ) = A( LIIP1+I+( J-1 )*LDA )
897     $                   - WORK( INV+LIIP1-1+BINDEX*LDV+I )*
898     $                  WORK( INHT+J-1+BINDEX*LDV ) -
899     $                  WORK( INH+LIIP1-1+BINDEX*LDV+I )*
900     $                  WORK( INVT+J-1+BINDEX*LDV )
901   80             CONTINUE
902   90          CONTINUE
903            END IF
904*
905*
906*
907*     Compute NV = A * NHT; NVT = A * NH
908*
909*           These two lines are necessary because these elements
910*           are not always involved in the calls to DTRMVT
911*           for two reasons:
912*           1)  On diagonal processors, the call to TRMVT
913*               involves only LTNM1-1 elements
914*           2)  On some processes, NQM1 < LTM1 or  LIIP1 < LTLIP1
915*               and when the results are combined across all processes,
916*               uninitialized values may be included.
917            WORK( INV+LIIP1-1+( BINDEX+1 )*LDV ) = Z_ZERO
918            WORK( INVT+LIJP1-1+( BINDEX+1 )*LDV+NQM1-1 ) = Z_ZERO
919*
920*
921            IF( MYROW.EQ.MYCOL ) THEN
922               IF( LTNM1.GT.1 ) THEN
923                  CALL DTRMVT( 'L', LTNM1-1,
924     $                         A( LTLIP1+1+( LIJP1-1 )*LDA ), LDA,
925     $                         WORK( INVT+LIJP1-1+( BINDEX+1 )*LDV ), 1,
926     $                         WORK( INH+LTLIP1+1-1+( BINDEX+1 )*LDV ),
927     $                         1, WORK( INV+LTLIP1+1-1+( BINDEX+1 )*
928     $                         LDV ), 1, WORK( INHT+LIJP1-1+( BINDEX+
929     $                         1 )*LDV ), 1 )
930               END IF
931               DO 100 I = 1, LTNM1
932                  WORK( INVT+LIJP1+I-1-1+( BINDEX+1 )*LDV )
933     $               = WORK( INVT+LIJP1+I-1-1+( BINDEX+1 )*LDV ) +
934     $               A( LTLIP1+I-1+( LIJP1+I-1-1 )*LDA )*
935     $               WORK( INH+LTLIP1+I-1-1+( BINDEX+1 )*LDV )
936  100          CONTINUE
937            ELSE
938               IF( LTNM1.GT.0 )
939     $            CALL DTRMVT( 'L', LTNM1, A( LTLIP1+( LIJP1-1 )*LDA ),
940     $                         LDA, WORK( INVT+LIJP1-1+( BINDEX+1 )*
941     $                         LDV ), 1, WORK( INH+LTLIP1-1+( BINDEX+
942     $                         1 )*LDV ), 1, WORK( INV+LTLIP1-1+
943     $                         ( BINDEX+1 )*LDV ), 1,
944     $                         WORK( INHT+LIJP1-1+( BINDEX+1 )*LDV ),
945     $                         1 )
946*
947            END IF
948*
949*
950*     We take advantage of the fact that:
951*     A * sum( B ) = sum ( A * B ) for matrices A,B
952*
953*     trueA = A + V * HT + H * VT
954*     hence:  (trueA)v = Av' + V * HT * v + H * VT * v
955*     VT * v = sum_p_in_NPROW ( VTp * v )
956*     H * VT * v = H * sum (VTp * v) = sum ( H * VTp * v )
957*
958*     v = v + V * HT * h + H * VT * h
959*
960*
961*
962*     tmp = HT * nh1
963            DO 110 I = 1, 2*( BINDEX+1 )
964               WORK( INTMP-1+I ) = 0
965  110       CONTINUE
966*
967            IF( BALANCED ) THEN
968               NPSET = NPROW
969               MYSETNUM = MYROW
970               ROWSPERPROC = ICEIL( NQB, NPSET )
971               MYFIRSTROW = MIN( NQB+1, 1+ROWSPERPROC*MYSETNUM )
972               NUMROWS = MIN( ROWSPERPROC, NQB-MYFIRSTROW+1 )
973*
974*
975*     tmp = HT * v
976*
977               CALL DGEMV( 'C', NUMROWS, BINDEX+1, Z_ONE,
978     $                     WORK( INHTB+MYFIRSTROW-1 ), LDV,
979     $                     WORK( INHTB+MYFIRSTROW-1+( BINDEX+1 )*LDV ),
980     $                     1, Z_ZERO, WORK( INTMP ), 1 )
981*     tmp2 = VT * v
982               CALL DGEMV( 'C', NUMROWS, BINDEX+1, Z_ONE,
983     $                     WORK( INVTB+MYFIRSTROW-1 ), LDV,
984     $                     WORK( INHTB+MYFIRSTROW-1+( BINDEX+1 )*LDV ),
985     $                     1, Z_ZERO, WORK( INTMP+BINDEX+1 ), 1 )
986*
987*
988               CALL DGSUM2D( ICTXT, 'C', ' ', 2*( BINDEX+1 ), 1,
989     $                       WORK( INTMP ), 2*( BINDEX+1 ), -1, -1 )
990            ELSE
991*     tmp = HT * v
992*
993               CALL DGEMV( 'C', NQB, BINDEX+1, Z_ONE, WORK( INHTB ),
994     $                     LDV, WORK( INHTB+( BINDEX+1 )*LDV ), 1,
995     $                     Z_ZERO, WORK( INTMP ), 1 )
996*     tmp2 = VT * v
997               CALL DGEMV( 'C', NQB, BINDEX+1, Z_ONE, WORK( INVTB ),
998     $                     LDV, WORK( INHTB+( BINDEX+1 )*LDV ), 1,
999     $                     Z_ZERO, WORK( INTMP+BINDEX+1 ), 1 )
1000*
1001            END IF
1002*
1003*
1004*
1005            IF( BALANCED ) THEN
1006               MYSETNUM = MYCOL
1007*
1008               ROWSPERPROC = ICEIL( NPB, NPSET )
1009               MYFIRSTROW = MIN( NPB+1, 1+ROWSPERPROC*MYSETNUM )
1010               NUMROWS = MIN( ROWSPERPROC, NPB-MYFIRSTROW+1 )
1011*
1012               CALL DGSUM2D( ICTXT, 'R', ' ', 2*( BINDEX+1 ), 1,
1013     $                       WORK( INTMP ), 2*( BINDEX+1 ), -1, -1 )
1014*
1015*
1016*     v = v + V * tmp
1017               IF( INDEX.GT.1. ) THEN
1018                  CALL DGEMV( 'N', NUMROWS, BINDEX+1, Z_NEGONE,
1019     $                        WORK( INVB+MYFIRSTROW-1 ), LDV,
1020     $                        WORK( INTMP ), 1, Z_ONE,
1021     $                        WORK( INVB+MYFIRSTROW-1+( BINDEX+1 )*
1022     $                        LDV ), 1 )
1023*
1024*     v = v + H * tmp2
1025                  CALL DGEMV( 'N', NUMROWS, BINDEX+1, Z_NEGONE,
1026     $                        WORK( INHB+MYFIRSTROW-1 ), LDV,
1027     $                        WORK( INTMP+BINDEX+1 ), 1, Z_ONE,
1028     $                        WORK( INVB+MYFIRSTROW-1+( BINDEX+1 )*
1029     $                        LDV ), 1 )
1030               END IF
1031*
1032            ELSE
1033*     v = v + V * tmp
1034               CALL DGEMV( 'N', NPB, BINDEX+1, Z_NEGONE, WORK( INVB ),
1035     $                     LDV, WORK( INTMP ), 1, Z_ONE,
1036     $                     WORK( INVB+( BINDEX+1 )*LDV ), 1 )
1037*
1038*
1039*     v = v + H * tmp2
1040               CALL DGEMV( 'N', NPB, BINDEX+1, Z_NEGONE, WORK( INHB ),
1041     $                     LDV, WORK( INTMP+BINDEX+1 ), 1, Z_ONE,
1042     $                     WORK( INVB+( BINDEX+1 )*LDV ), 1 )
1043*
1044            END IF
1045*
1046*
1047*     Transpose NV and add it back into NVT
1048*
1049            IF( MYROW.EQ.MYCOL ) THEN
1050               DO 120 I = 0, NQM1 - 1
1051                  WORK( INTMP+I ) = WORK( INVT+LIJP1-1+( BINDEX+1 )*LDV+
1052     $                              I )
1053  120          CONTINUE
1054            ELSE
1055               CALL DGESD2D( ICTXT, NQM1, 1,
1056     $                       WORK( INVT+LIJP1-1+( BINDEX+1 )*LDV ),
1057     $                       NQM1, MYCOL, MYROW )
1058               CALL DGERV2D( ICTXT, NPM1, 1, WORK( INTMP ), NPM1, MYCOL,
1059     $                       MYROW )
1060*
1061            END IF
1062            DO 130 I = 0, NPM1 - 1
1063               WORK( INV+LIIP1-1+( BINDEX+1 )*LDV+I ) = WORK( INV+LIIP1-
1064     $            1+( BINDEX+1 )*LDV+I ) + WORK( INTMP+I )
1065  130       CONTINUE
1066*
1067*     Sum-to-one NV rowwise (within a row)
1068*
1069            CALL DGSUM2D( ICTXT, 'R', ' ', NPM1, 1,
1070     $                    WORK( INV+LIIP1-1+( BINDEX+1 )*LDV ), NPM1,
1071     $                    MYROW, NXTCOL )
1072*
1073*
1074*     Dot product c = NV * NH
1075*     Sum-to-all c within next processor column
1076*
1077*
1078            IF( MYCOL.EQ.NXTCOL ) THEN
1079               CC( 1 ) = Z_ZERO
1080               DO 140 I = 0, NPM1 - 1
1081                  CC( 1 ) = CC( 1 ) + WORK( INV+LIIP1-1+( BINDEX+1 )*
1082     $                      LDV+I )*WORK( INH+LIIP1-1+( BINDEX+1 )*LDV+
1083     $                      I )
1084  140          CONTINUE
1085               IF( MYROW.EQ.NXTROW ) THEN
1086                  CC( 2 ) = WORK( INV+LIIP1-1+( BINDEX+1 )*LDV )
1087                  CC( 3 ) = WORK( INH+LIIP1-1+( BINDEX+1 )*LDV )
1088               ELSE
1089                  CC( 2 ) = Z_ZERO
1090                  CC( 3 ) = Z_ZERO
1091               END IF
1092               CALL DGSUM2D( ICTXT, 'C', ' ', 3, 1, CC, 3, -1, NXTCOL )
1093*
1094               TOPV = CC( 2 )
1095               C = CC( 1 )
1096               TOPH = CC( 3 )
1097*
1098               TOPNV = TOPTAU*( TOPV-C*TOPTAU / 2*TOPH )
1099*
1100*
1101*     Compute V = Tau * (V - C * Tau' / 2 * H )
1102*
1103*
1104               DO 150 I = 0, NPM1 - 1
1105                  WORK( INV+LIIP1-1+( BINDEX+1 )*LDV+I ) = TOPTAU*
1106     $               ( WORK( INV+LIIP1-1+( BINDEX+1 )*LDV+I )-C*TOPTAU /
1107     $               2*WORK( INH+LIIP1-1+( BINDEX+1 )*LDV+I ) )
1108  150          CONTINUE
1109*
1110            END IF
1111*
1112*
1113  160    CONTINUE
1114*
1115*
1116*     Perform the rank2k update
1117*
1118         IF( MAXINDEX.LT.N ) THEN
1119*
1120            DO 170 I = 0, NPM1 - 1
1121               WORK( INTMP+I ) = WORK( INH+LIIP1-1+ANB*LDV+I )
1122  170       CONTINUE
1123*
1124*
1125*
1126            IF( .NOT.TWOGEMMS ) THEN
1127               IF( INTERLEAVE ) THEN
1128                  LDZG = LDV / 2
1129               ELSE
1130                  CALL DLAMOV( 'A', LTNM1, ANB, WORK( INHT+LIJP1-1 ),
1131     $                         LDV, WORK( INVT+LIJP1-1+ANB*LDV ), LDV )
1132*
1133                  CALL DLAMOV( 'A', LTNM1, ANB, WORK( INV+LTLIP1-1 ),
1134     $                         LDV, WORK( INH+LTLIP1-1+ANB*LDV ), LDV )
1135                  LDZG = LDV
1136               END IF
1137               NBZG = ANB*2
1138            ELSE
1139               LDZG = LDV
1140               NBZG = ANB
1141            END IF
1142*
1143*
1144            DO 180 PBMIN = 1, LTNM1, PNB
1145*
1146               PBSIZE = MIN( PNB, LTNM1-PBMIN+1 )
1147               PBMAX = MIN( LTNM1, PBMIN+PNB-1 )
1148               CALL DGEMM( 'N', 'C', PBSIZE, PBMAX, NBZG, Z_NEGONE,
1149     $                     WORK( INH+LTLIP1-1+PBMIN-1 ), LDZG,
1150     $                     WORK( INVT+LIJP1-1 ), LDZG, Z_ONE,
1151     $                     A( LTLIP1+PBMIN-1+( LIJP1-1 )*LDA ), LDA )
1152               IF( TWOGEMMS ) THEN
1153                  CALL DGEMM( 'N', 'C', PBSIZE, PBMAX, ANB, Z_NEGONE,
1154     $                        WORK( INV+LTLIP1-1+PBMIN-1 ), LDZG,
1155     $                        WORK( INHT+LIJP1-1 ), LDZG, Z_ONE,
1156     $                        A( LTLIP1+PBMIN-1+( LIJP1-1 )*LDA ), LDA )
1157               END IF
1158  180       CONTINUE
1159*
1160*
1161*
1162            DO 190 I = 0, NPM1 - 1
1163               WORK( INV+LIIP1-1+I ) = WORK( INV+LIIP1-1+ANB*LDV+I )
1164               WORK( INH+LIIP1-1+I ) = WORK( INTMP+I )
1165  190       CONTINUE
1166            DO 200 I = 0, NQM1 - 1
1167               WORK( INHT+LIJP1-1+I ) = WORK( INHT+LIJP1-1+ANB*LDV+I )
1168  200       CONTINUE
1169*
1170*
1171         END IF
1172*
1173*     End of the update A code
1174*
1175  210 CONTINUE
1176*
1177      IF( MYCOL.EQ.NXTCOL ) THEN
1178         IF( MYROW.EQ.NXTROW ) THEN
1179*
1180            D( NQ ) = A( NP+( NQ-1 )*LDA )
1181*
1182            CALL DGEBS2D( ICTXT, 'C', ' ', 1, 1, D( NQ ), 1 )
1183         ELSE
1184            CALL DGEBR2D( ICTXT, 'C', ' ', 1, 1, D( NQ ), 1, NXTROW,
1185     $                    NXTCOL )
1186         END IF
1187      END IF
1188*
1189*
1190*
1191*
1192      WORK( 1 ) = DBLE( LWMIN )
1193      RETURN
1194*
1195*     End of PDSYTTRD
1196*
1197*
1198      END
1199