1 /* ---------------------------------------------------------------------
2 *
3 *  -- PBLAS routine (version 2.0) --
4 *     University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5 *     and University of California, Berkeley.
6 *     April 1, 1998
7 *
8 *  ---------------------------------------------------------------------
9 */
10 /*
11 *  Include files
12 */
13 #include "pblas.h"
14 #include "PBpblas.h"
15 #include "PBtools.h"
16 #include "PBblacs.h"
17 #include "PBblas.h"
18 
19 #ifdef __STDC__
pzsyrk_(F_CHAR_T UPLO,F_CHAR_T TRANS,int * N,int * K,double * ALPHA,double * A,int * IA,int * JA,int * DESCA,double * BETA,double * C,int * IC,int * JC,int * DESCC)20 void pzsyrk_( F_CHAR_T UPLO, F_CHAR_T TRANS, int * N, int * K,
21               double * ALPHA,
22               double * A, int * IA, int * JA, int * DESCA,
23               double * BETA,
24               double * C, int * IC, int * JC, int * DESCC )
25 #else
26 void pzsyrk_( UPLO, TRANS, N, K, ALPHA, A, IA, JA, DESCA, BETA,
27               C, IC, JC, DESCC )
28 /*
29 *  .. Scalar Arguments ..
30 */
31    F_CHAR_T       TRANS, UPLO;
32    int            * IA, * IC, * JA, * JC, * K, * N;
33    double         * ALPHA, * BETA;
34 /*
35 *  .. Array Arguments ..
36 */
37    int            * DESCA, * DESCC;
38    double         * A, * C;
39 #endif
40 {
41 /*
42 *  Purpose
43 *  =======
44 *
45 *  PZSYRK  performs one of the symmetric rank k operations
46 *
47 *     sub( C ) := alpha*sub( A )*sub( A )' + beta*sub( C ),
48 *
49 *  or
50 *
51 *     sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ),
52 *
53 *  where
54 *
55 *     sub( C ) denotes C(IC:IC+N-1,JC:JC+N-1), and,
56 *
57 *     sub( A ) denotes A(IA:IA+N-1,JA:JA+K-1)  if TRANS = 'N',
58 *                      A(IA:IA+K-1,JA:JA+N-1)  otherwise.
59 *
60 *  Alpha  and   beta  are  scalars,  sub( C )  is  an  n by n  symmetric
61 *  submatrix and sub( A ) is an n by k submatrix in the first case and a
62 *  k by n submatrix in the second case.
63 *
64 *  Notes
65 *  =====
66 *
67 *  A description  vector  is associated with each 2D block-cyclicly dis-
68 *  tributed matrix.  This  vector  stores  the  information  required to
69 *  establish the  mapping  between a  matrix entry and its corresponding
70 *  process and memory location.
71 *
72 *  In  the  following  comments,   the character _  should  be  read  as
73 *  "of  the  distributed  matrix".  Let  A  be a generic term for any 2D
74 *  block cyclicly distributed matrix.  Its description vector is DESC_A:
75 *
76 *  NOTATION         STORED IN       EXPLANATION
77 *  ---------------- --------------- ------------------------------------
78 *  DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
79 *  CTXT_A  (global) DESCA[ CTXT_  ] The BLACS context handle, indicating
80 *                                   the NPROW x NPCOL BLACS process grid
81 *                                   A  is  distributed over. The context
82 *                                   itself  is  global,  but  the handle
83 *                                   (the integer value) may vary.
84 *  M_A     (global) DESCA[ M_     ] The  number of rows in the distribu-
85 *                                   ted matrix A, M_A >= 0.
86 *  N_A     (global) DESCA[ N_     ] The number of columns in the distri-
87 *                                   buted matrix A, N_A >= 0.
88 *  IMB_A   (global) DESCA[ IMB_   ] The number of rows of the upper left
89 *                                   block of the matrix A, IMB_A > 0.
90 *  INB_A   (global) DESCA[ INB_   ] The  number  of columns of the upper
91 *                                   left   block   of   the  matrix   A,
92 *                                   INB_A > 0.
93 *  MB_A    (global) DESCA[ MB_    ] The blocking factor used to  distri-
94 *                                   bute the last  M_A-IMB_A  rows of A,
95 *                                   MB_A > 0.
96 *  NB_A    (global) DESCA[ NB_    ] The blocking factor used to  distri-
97 *                                   bute the last  N_A-INB_A  columns of
98 *                                   A, NB_A > 0.
99 *  RSRC_A  (global) DESCA[ RSRC_  ] The process row over which the first
100 *                                   row of the matrix  A is distributed,
101 *                                   NPROW > RSRC_A >= 0.
102 *  CSRC_A  (global) DESCA[ CSRC_  ] The  process column  over  which the
103 *                                   first column of  A  is  distributed.
104 *                                   NPCOL > CSRC_A >= 0.
105 *  LLD_A   (local)  DESCA[ LLD_   ] The  leading dimension  of the local
106 *                                   array  storing  the  local blocks of
107 *                                   the distributed matrix A,
108 *                                   IF( Lc( 1, N_A ) > 0 )
109 *                                      LLD_A >= MAX( 1, Lr( 1, M_A ) )
110 *                                   ELSE
111 *                                      LLD_A >= 1.
112 *
113 *  Let K be the number of  rows of a matrix A starting at the global in-
114 *  dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
115 *  that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
116 *  receive if these K rows were distributed over NPROW processes.  If  K
117 *  is the number of columns of a matrix  A  starting at the global index
118 *  JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number  of co-
119 *  lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would  receive if
120 *  these K columns were distributed over NPCOL processes.
121 *
122 *  The values of Lr() and Lc() may be determined via a call to the func-
123 *  tion PB_Cnumroc:
124 *  Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
125 *  Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
126 *
127 *  Arguments
128 *  =========
129 *
130 *  UPLO    (global input) CHARACTER*1
131 *          On  entry,   UPLO  specifies  whether  the  local  pieces  of
132 *          the array  C  containing the  upper or lower triangular  part
133 *          of the symmetric submatrix  sub( C )  are to be referenced as
134 *          follows:
135 *
136 *             UPLO = 'U' or 'u'   Only the local pieces corresponding to
137 *                                 the   upper  triangular  part  of  the
138 *                                 symmetric submatrix sub( C ) are to be
139 *                                 referenced,
140 *
141 *             UPLO = 'L' or 'l'   Only the local pieces corresponding to
142 *                                 the   lower  triangular  part  of  the
143 *                                 symmetric submatrix sub( C ) are to be
144 *                                 referenced.
145 *
146 *  TRANS   (global input) CHARACTER*1
147 *          On entry,  TRANS  specifies the  operation to be performed as
148 *          follows:
149 *
150 *             TRANS = 'N' or 'n'
151 *                  sub( C ) := alpha*sub( A )*sub( A )' + beta*sub( C ),
152 *
153 *             TRANS = 'T' or 't'
154 *                  sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ).
155 *
156 *  N       (global input) INTEGER
157 *          On entry,  N specifies the order of the  submatrix  sub( C ).
158 *          N must be at least zero.
159 *
160 *  K       (global input) INTEGER
161 *          On entry, with TRANS = 'N' or 'n',  K specifies the number of
162 *          columns  of  the submatrix  sub( A ), and with TRANS = 'T' or
163 *          't',  K  specifies  the  number  of  rows  of  the  submatrix
164 *          sub( A ). K  must be at least zero.
165 *
166 *  ALPHA   (global input) COMPLEX*16
167 *          On entry, ALPHA specifies the scalar alpha.   When  ALPHA  is
168 *          supplied  as  zero  then  the  local entries of  the array  A
169 *          corresponding to the entries of the submatrix  sub( A )  need
170 *          not be set on input.
171 *
172 *  A       (local input) COMPLEX*16 array
173 *          On entry, A is an array of dimension (LLD_A, Ka), where Ka is
174 *          at least  Lc( 1, JA+K-1 ) when  TRANS = 'N' or 'n', and is at
175 *          least Lc( 1, JA+N-1 ) otherwise.  Before  entry,  this  array
176 *          contains the local entries of the matrix A.
177 *          Before entry with TRANS = 'N' or 'n', this array contains the
178 *          local entries corresponding to the entries of the n by k sub-
179 *          matrix sub( A ), otherwise the local entries corresponding to
180 *          the entries of the k by n submatrix sub( A ).
181 *
182 *  IA      (global input) INTEGER
183 *          On entry, IA  specifies A's global row index, which points to
184 *          the beginning of the submatrix sub( A ).
185 *
186 *  JA      (global input) INTEGER
187 *          On entry, JA  specifies A's global column index, which points
188 *          to the beginning of the submatrix sub( A ).
189 *
190 *  DESCA   (global and local input) INTEGER array
191 *          On entry, DESCA  is an integer array of dimension DLEN_. This
192 *          is the array descriptor for the matrix A.
193 *
194 *  BETA    (global input) COMPLEX*16
195 *          On entry,  BETA  specifies the scalar  beta.   When  BETA  is
196 *          supplied  as  zero  then  the  local entries of  the array  C
197 *          corresponding to the entries of the submatrix  sub( C )  need
198 *          not be set on input.
199 *
200 *  C       (local input/local output) COMPLEX*16 array
201 *          On entry, C is an array of dimension (LLD_C, Kc), where Kc is
202 *          at least Lc( 1, JC+N-1 ).  Before  entry, this array contains
203 *          the local entries of the matrix C.
204 *          Before  entry  with  UPLO = 'U' or 'u', this  array  contains
205 *          the local entries corresponding to the upper triangular  part
206 *          of the  symmetric  submatrix  sub( C ), and the local entries
207 *          corresponding to the  strictly lower triangular  of  sub( C )
208 *          are not  referenced.  On exit,  the upper triangular part  of
209 *          sub( C ) is overwritten by the  upper triangular part  of the
210 *          updated submatrix.
211 *          Before  entry  with  UPLO = 'L' or 'l', this  array  contains
212 *          the local entries corresponding to the lower triangular  part
213 *          of the  symmetric  submatrix  sub( C ), and the local entries
214 *          corresponding to the  strictly upper triangular  of  sub( C )
215 *          are not  referenced.  On exit,  the lower triangular part  of
216 *          sub( C ) is overwritten by the  lower triangular part  of the
217 *          updated submatrix.
218 *
219 *  IC      (global input) INTEGER
220 *          On entry, IC  specifies C's global row index, which points to
221 *          the beginning of the submatrix sub( C ).
222 *
223 *  JC      (global input) INTEGER
224 *          On entry, JC  specifies C's global column index, which points
225 *          to the beginning of the submatrix sub( C ).
226 *
227 *  DESCC   (global and local input) INTEGER array
228 *          On entry, DESCC  is an integer array of dimension DLEN_. This
229 *          is the array descriptor for the matrix C.
230 *
231 *  -- Written on April 1, 1998 by
232 *     Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
233 *
234 *  ---------------------------------------------------------------------
235 */
236 /*
237 *  .. Local Scalars ..
238 */
239    char           DirA, OpC, OpR, TopC, TopR, TranOp, UploC, ctop, ctopsave,
240                   rtop, rtopsave;
241    int            Ai, Aj, ChooseAC, Ci, Cj, ForceTop, ctxt, info, mycol,
242                   myrow, nb, notran, npcol, nprow, upper;
243    double         Aest, ACest, tmp1, tmp2, tmp3, tmp4;
244    PBTYP_T        * type;
245 /*
246 *  .. Local Arrays ..
247 */
248    int            Ad[DLEN_], Cd[DLEN_];
249 /* ..
250 *  .. Executable Statements ..
251 *
252 */
253    upper  = ( ( UploC  = Mupcase( F2C_CHAR( UPLO  )[0] ) ) ==  CUPPER );
254    notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN );
255    PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad );
256    PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd );
257 #ifndef NO_ARGCHK
258 /*
259 *  Test the input parameters
260 */
261    Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
262    if( !( info = ( ( nprow == -1 ) ? -( 901 + CTXT_ ) : 0 ) ) )
263    {
264       if( ( !upper ) && ( UploC != CLOWER ) )
265       {
266          PB_Cwarn( ctxt, __LINE__, "PZSYRK", "Illegal UPLO = %c\n", UploC );
267          info = -1;
268       }
269       else if( ( !notran ) && ( TranOp != CTRAN ) )
270       {
271          PB_Cwarn( ctxt, __LINE__, "PZSYRK", "Illegal TRANS = %c\n", TranOp );
272          info = -2;
273       }
274       if( notran )
275          PB_Cchkmat( ctxt, "PZSYRK", "A", *N, 3, *K, 4, Ai, Aj, Ad,  9,
276                      &info );
277       else
278          PB_Cchkmat( ctxt, "PZSYRK", "A", *K, 4, *N, 3, Ai, Aj, Ad,  9,
279                      &info );
280       PB_Cchkmat(    ctxt, "PZSYRK", "C", *N, 3, *N, 3, Ci, Cj, Cd, 14,
281                      &info );
282    }
283    if( info ) { PB_Cabort( ctxt, "PZSYRK", info ); return; }
284 #endif
285 /*
286 *  Quick return if possible
287 */
288    if( ( *N == 0 ) ||
289        ( ( ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) ||
290          ( *K == 0                                                        ) ) &&
291          ( ( BETA[REAL_PART] ==  ONE ) && ( BETA[IMAG_PART] == ZERO ) ) ) )
292       return;
293 /*
294 *  Get type structure
295 */
296    type = PB_Cztypeset();
297 /*
298 *  And when alpha or K is zero
299 */
300    if( ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) ||
301        ( *K == 0 ) )
302    {
303       if( ( BETA[REAL_PART] == ZERO ) && ( BETA[IMAG_PART] == ZERO ) )
304       {
305          PB_Cplapad( type, &UploC, NOCONJG, *N, *N, type->zero, type->zero,
306                      ((char *) C), Ci, Cj, Cd );
307       }
308       else
309       {
310          PB_Cplascal( type, &UploC, NOCONJG, *N, *N, ((char *) BETA),
311                       ((char *) C), Ci, Cj, Cd );
312       }
313       return;
314    }
315 /*
316 *  Start the operations
317 */
318 #ifdef NO_ARGCHK
319    Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
320 #endif
321 /*
322 *  Algorithm selection is based on approximation of the communication volume
323 *  for distributed and aligned operands.
324 *
325 *  ACest: both operands sub( A ) and sub( C ) are communicated (K >> N)
326 *  Aest : only sub( A ) is communicated                        (N >> K)
327 */
328    if( notran )
329    {
330       tmp1  = DNROC( *N, Cd[MB_], nprow ); tmp3 = DNROC( *K, Ad[NB_], npcol );
331       ACest = (double)(*N) *
332         ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp3 ) +
333           ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO :
334             CBRATIO * tmp1 / TWO ) );
335       tmp1  = DNROC( *N, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol );
336                                            tmp4 = DNROC( *N, Ad[MB_], nprow );
337       Aest  = (double)(*K) *
338               ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) +
339                 ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ) );
340    }
341    else
342    {
343       tmp2  = DNROC( *N, Cd[NB_], npcol ); tmp4 = DNROC( *K, Ad[MB_], nprow );
344       ACest = (double)(*N) *
345         ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp4 ) +
346           ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO :
347             CBRATIO * tmp2 / TWO ) );
348       tmp1  = DNROC( *N, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol );
349       tmp3  = DNROC( *N, Ad[NB_], npcol );
350       Aest  = (double)(*K) *
351               ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ) +
352                 ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) );
353    }
354 /*
355 *  Shift a little the cross-over point between both algorithms.
356 */
357    ChooseAC = ( ( 1.3 * ACest ) <= Aest );
358 /*
359 *  BLACS topologies are enforced iff N and K are strictly greater than the
360 *  logical block size returned by pilaenv_. Otherwise, it is assumed that the
361 *  routine calling this routine has already selected an adequate topology.
362 */
363    nb       = pilaenv_( &ctxt, C2F_CHAR( &type->type ) );
364    ForceTop = ( ( *N > nb ) && ( *K > nb ) );
365 
366    if( ChooseAC )
367    {
368       if( notran )
369       {
370          OpC  = CBCAST;
371          ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET );
372 
373          if( ForceTop )
374          {
375             OpR  = CCOMBINE;
376             rtop = *PB_Ctop( &ctxt, &OpR, ROW,    TOP_GET );
377 
378             rtopsave = rtop;
379             ctopsave = ctop;
380 
381             if( upper ) { TopR = CTOP_IRING; TopC = CTOP_DRING; }
382             else        { TopR = CTOP_DRING; TopC = CTOP_IRING; }
383 
384             ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC );
385             rtop = *PB_Ctop( &ctxt, &OpR, ROW,    &TopR );
386 /*
387 *  Remove the next line when the BLACS combine operations support ring
388 *  topologies
389 */
390             rtop = *PB_Ctop( &ctxt, &OpR, ROW,    TOP_DEFAULT );
391          }
392 
393          DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD );
394       }
395       else
396       {
397          OpR  = CBCAST;
398          rtop = *PB_Ctop( &ctxt, &OpR, ROW,    TOP_GET );
399 
400          if( ForceTop )
401          {
402             OpC  = CCOMBINE;
403             ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET );
404 
405             rtopsave = rtop;
406             ctopsave = ctop;
407 
408             if( upper ) { TopR = CTOP_IRING; TopC = CTOP_DRING; }
409             else        { TopR = CTOP_DRING; TopC = CTOP_IRING; }
410 
411             rtop = *PB_Ctop( &ctxt, &OpR, ROW,    &TopR );
412             ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC );
413 /*
414 *  Remove the next line when the BLACS combine operations support ring
415 *  topologies
416 */
417             ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT );
418          }
419          DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD );
420       }
421 
422       PB_CpsyrkAC( type, &DirA, NOCONJG, &UploC, ( notran ? NOTRAN : TRAN ), *N,
423                    *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)BETA),
424                    ((char *)C), Ci, Cj, Cd );
425    }
426    else
427    {
428       if( notran )
429       {
430          OpR  = CBCAST;
431          rtop = *PB_Ctop( &ctxt, &OpR, ROW,    TOP_GET );
432 
433          if( ForceTop )
434          {
435             OpC  = CBCAST;
436             ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET );
437 
438             rtopsave = rtop;
439             ctopsave = ctop;
440 /*
441 *  No clear winner for the ring topologies, so that if a ring topology is
442 *  already selected, keep it.
443 */
444             if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) &&
445                 ( rtop != CTOP_SRING ) )
446                rtop = *PB_Ctop( &ctxt, &OpR, ROW,    TOP_SRING );
447             if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) &&
448                 ( ctop != CTOP_SRING ) )
449                ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING );
450          }
451 
452          DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD );
453       }
454       else
455       {
456          OpC  = CBCAST;
457          ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET );
458 
459          if( ForceTop )
460          {
461             OpR  = CBCAST;
462             rtop = *PB_Ctop( &ctxt, &OpR, ROW,    TOP_GET );
463 
464             rtopsave = rtop;
465             ctopsave = ctop;
466 /*
467 *  No clear winner for the ring topologies, so that if a ring topology is
468 *  already selected, keep it.
469 */
470             if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) &&
471                 ( rtop != CTOP_SRING ) )
472                rtop = *PB_Ctop( &ctxt, &OpR, ROW,    TOP_SRING );
473             if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) &&
474                 ( ctop != CTOP_SRING ) )
475                ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING );
476          }
477 
478          DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD );
479       }
480 
481       PB_CpsyrkA( type, &DirA, NOCONJG, &UploC, ( notran ? NOTRAN : TRAN ),  *N,
482                   *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)BETA),
483                   ((char *)C), Ci, Cj, Cd );
484    }
485 /*
486 *  Restore the BLACS topologies when necessary.
487 */
488    if( ForceTop )
489    {
490       rtopsave = *PB_Ctop( &ctxt, &OpR, ROW,    &rtopsave );
491       ctopsave = *PB_Ctop( &ctxt, &OpC, COLUMN, &ctopsave );
492    }
493 /*
494 *  End of PZSYRK
495 */
496 }
497