1      SUBROUTINE PSLAUUM( UPLO, N, A, IA, JA, DESCA )
2*
3*  -- ScaLAPACK auxiliary routine (version 1.7) --
4*     University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5*     and University of California, Berkeley.
6*     May 1, 1997
7*
8*     .. Scalar Arguments ..
9      CHARACTER          UPLO
10      INTEGER            IA, JA, N
11*     ..
12*     .. Array Arguments ..
13      INTEGER            DESCA( * )
14      REAL               A( * )
15*     ..
16*
17*  Purpose
18*  =======
19*
20*  PSLAUUM computes the product U * U' or L' * L, where the triangular
21*  factor U or L is stored in the upper or lower triangular part of
22*  the distributed matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1).
23*
24*  If UPLO = 'U' or 'u' then the upper triangle of the result is stored,
25*  overwriting the factor U in sub( A ).
26*  If UPLO = 'L' or 'l' then the lower triangle of the result is stored,
27*  overwriting the factor L in sub( A ).
28*
29*  This is the blocked form of the algorithm, calling Level 3 PBLAS.
30*
31*  Notes
32*  =====
33*
34*  Each global data object is described by an associated description
35*  vector.  This vector stores the information required to establish
36*  the mapping between an object element and its corresponding process
37*  and memory location.
38*
39*  Let A be a generic term for any 2D block cyclicly distributed array.
40*  Such a global array has an associated description vector DESCA.
41*  In the following comments, the character _ should be read as
42*  "of the global array".
43*
44*  NOTATION        STORED IN      EXPLANATION
45*  --------------- -------------- --------------------------------------
46*  DTYPE_A(global) DESCA( DTYPE_ )The descriptor type.  In this case,
47*                                 DTYPE_A = 1.
48*  CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
49*                                 the BLACS process grid A is distribu-
50*                                 ted over. The context itself is glo-
51*                                 bal, but the handle (the integer
52*                                 value) may vary.
53*  M_A    (global) DESCA( M_ )    The number of rows in the global
54*                                 array A.
55*  N_A    (global) DESCA( N_ )    The number of columns in the global
56*                                 array A.
57*  MB_A   (global) DESCA( MB_ )   The blocking factor used to distribute
58*                                 the rows of the array.
59*  NB_A   (global) DESCA( NB_ )   The blocking factor used to distribute
60*                                 the columns of the array.
61*  RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
62*                                 row of the array A is distributed.
63*  CSRC_A (global) DESCA( CSRC_ ) The process column over which the
64*                                 first column of the array A is
65*                                 distributed.
66*  LLD_A  (local)  DESCA( LLD_ )  The leading dimension of the local
67*                                 array.  LLD_A >= MAX(1,LOCr(M_A)).
68*
69*  Let K be the number of rows or columns of a distributed matrix,
70*  and assume that its process grid has dimension p x q.
71*  LOCr( K ) denotes the number of elements of K that a process
72*  would receive if K were distributed over the p processes of its
73*  process column.
74*  Similarly, LOCc( K ) denotes the number of elements of K that a
75*  process would receive if K were distributed over the q processes of
76*  its process row.
77*  The values of LOCr() and LOCc() may be determined via a call to the
78*  ScaLAPACK tool function, NUMROC:
79*          LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
80*          LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
81*  An upper bound for these quantities may be computed by:
82*          LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
83*          LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
84*
85*  Arguments
86*  =========
87*
88*  UPLO    (global input) CHARACTER*1
89*          Specifies whether the triangular factor stored in the
90*          distributed matrix sub( A ) is upper or lower triangular:
91*          = 'U':  Upper triangular
92*          = 'L':  Lower triangular
93*
94*  N       (global input) INTEGER
95*          The number of rows and columns to be operated on, i.e. the
96*          order of the triangular factor U or L. N >= 0.
97*
98*  A       (local input/local output) REAL pointer into the
99*          local memory to an array of dimension (LLD_A, LOCc(JA+N-1)).
100*          On entry, the local pieces of the triangular factor L or U.
101*          On exit, if UPLO = 'U', the upper triangle of the distributed
102*          matrix sub( A ) is overwritten with the upper triangle of the
103*          product U * U'; if UPLO = 'L', the lower triangle of sub( A )
104*          is overwritten with the lower triangle of the product L' * L.
105*
106*  IA      (global input) INTEGER
107*          The row index in the global array A indicating the first
108*          row of sub( A ).
109*
110*  JA      (global input) INTEGER
111*          The column index in the global array A indicating the
112*          first column of sub( A ).
113*
114*  DESCA   (global and local input) INTEGER array of dimension DLEN_.
115*          The array descriptor for the distributed matrix A.
116*
117*  =====================================================================
118*
119*     .. Parameters ..
120      INTEGER            BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
121     $                   LLD_, MB_, M_, NB_, N_, RSRC_
122      PARAMETER          ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1,
123     $                     CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6,
124     $                     RSRC_ = 7, CSRC_ = 8, LLD_ = 9 )
125      REAL               ONE
126      PARAMETER          ( ONE = 1.0E+0 )
127*     ..
128*     .. Local Scalars ..
129      INTEGER            I, J, JB, JN
130*     ..
131*     .. External Subroutines ..
132      EXTERNAL           PSGEMM, PSLAUU2, PSTRMM, PSSYRK
133*     ..
134*     .. External Functions ..
135      LOGICAL            LSAME
136      INTEGER            ICEIL
137      EXTERNAL           ICEIL, LSAME
138*     ..
139*     .. Intrinsic Functions ..
140      INTRINSIC          MIN
141*     ..
142*     .. Executable Statements ..
143*
144*     Quick return if possible
145*
146      IF( N.EQ.0 )
147     $   RETURN
148*
149      JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 )
150      IF(  LSAME( UPLO, 'U' ) ) THEN
151*
152*        Compute the product U * U'.
153*
154*        Handle first block separately
155*
156         JB = JN-JA+1
157         CALL PSLAUU2( 'Upper', JB, A, IA, JA, DESCA )
158         IF( JB.LE.N-1 ) THEN
159            CALL PSSYRK( 'Upper', 'No transpose', JB, N-JB, ONE, A, IA,
160     $                   JA+JB, DESCA, ONE, A, IA, JA, DESCA )
161         END IF
162*
163*        Loop over remaining block of columns
164*
165         DO 10 J = JN+1, JA+N-1, DESCA( NB_ )
166            JB = MIN( N-J+JA, DESCA( NB_ ) )
167            I = IA + J - JA
168            CALL PSTRMM( 'Right', 'Upper', 'Transpose',  'Non-unit',
169     $                   J-JA, JB, ONE, A, I, J, DESCA, A, IA, J,
170     $                   DESCA )
171            CALL PSLAUU2( 'Upper', JB, A, I, J, DESCA )
172            IF( J+JB.LE.JA+N-1 ) THEN
173               CALL PSGEMM( 'No transpose', 'Transpose', J-JA, JB,
174     $                      N-J-JB+JA, ONE, A, IA, J+JB, DESCA, A, I,
175     $                      J+JB, DESCA, ONE, A, IA, J, DESCA )
176               CALL PSSYRK( 'Upper', 'No transpose', JB, N-J-JB+JA, ONE,
177     $                      A, I, J+JB, DESCA, ONE, A, I, J, DESCA )
178            END IF
179   10    CONTINUE
180      ELSE
181*
182*        Compute the product L' * L.
183*
184*        Handle first block separately
185*
186         JB = JN-JA+1
187         CALL PSLAUU2( 'Lower', JB, A, IA, JA, DESCA )
188         IF( JB.LE.N-1 ) THEN
189            CALL PSSYRK( 'Lower', 'Transpose', JB, N-JB, ONE, A, IA+JB,
190     $                   JA, DESCA, ONE, A, IA, JA, DESCA )
191         END IF
192*
193*        Loop over remaining block of columns
194*
195         DO 20 J = JN+1, JA+N-1, DESCA( NB_ )
196            JB = MIN( N-J+JA, DESCA( NB_ ) )
197            I = IA + J - JA
198            CALL PSTRMM( 'Left', 'Lower', 'Transpose', 'Non-unit', JB,
199     $                   J-JA, ONE, A, I, J, DESCA, A, I, JA, DESCA )
200            CALL PSLAUU2( 'Lower', JB, A, I, J, DESCA )
201            IF( J+JB.LE.JA+N-1 ) THEN
202               CALL PSGEMM( 'Transpose', 'No transpose', JB, J-JA,
203     $                      N-J-JB+JA, ONE, A, I+JB, J, DESCA, A, I+JB,
204     $                      JA, DESCA, ONE, A, I, JA, DESCA )
205               CALL PSSYRK( 'Lower', 'Transpose', JB, N-J-JB+JA, ONE,
206     $                      A, I+JB, J, DESCA, ONE, A, I, J, DESCA )
207            END IF
208   20    CONTINUE
209      END IF
210*
211      RETURN
212*
213*     End of PSLAUUM
214*
215      END
216