1 /* ---------------------------------------------------------------------
2 *
3 *  -- PBLAS routine (version 2.0) --
4 *     University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5 *     and University of California, Berkeley.
6 *     April 1, 1998
7 *
8 *  ---------------------------------------------------------------------
9 */
10 /*
11 *  Include files
12 */
13 #include "pblas.h"
14 #include "PBpblas.h"
15 #include "PBtools.h"
16 #include "PBblacs.h"
17 #include "PBblas.h"
18 
19 #ifdef __STDC__
pssyrk_(F_CHAR_T UPLO,F_CHAR_T TRANS,int * N,int * K,float * ALPHA,float * A,int * IA,int * JA,int * DESCA,float * BETA,float * C,int * IC,int * JC,int * DESCC)20 void pssyrk_( F_CHAR_T UPLO, F_CHAR_T TRANS, int * N, int * K,
21               float * ALPHA,
22               float * A, int * IA, int * JA, int * DESCA,
23               float * BETA,
24               float * C, int * IC, int * JC, int * DESCC )
25 #else
26 void pssyrk_( UPLO, TRANS, N, K, ALPHA, A, IA, JA, DESCA, BETA,
27               C, IC, JC, DESCC )
28 /*
29 *  .. Scalar Arguments ..
30 */
31    F_CHAR_T       TRANS, UPLO;
32    int            * IA, * IC, * JA, * JC, * K, * N;
33    float          * ALPHA, * BETA;
34 /*
35 *  .. Array Arguments ..
36 */
37    int            * DESCA, * DESCC;
38    float          * A, * C;
39 #endif
40 {
41 /*
42 *  Purpose
43 *  =======
44 *
45 *  PSSYRK  performs one of the symmetric rank k operations
46 *
47 *     sub( C ) := alpha*sub( A )*sub( A )' + beta*sub( C ),
48 *
49 *  or
50 *
51 *     sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ),
52 *
53 *  where
54 *
55 *     sub( C ) denotes C(IC:IC+N-1,JC:JC+N-1), and,
56 *
57 *     sub( A ) denotes A(IA:IA+N-1,JA:JA+K-1)  if TRANS = 'N',
58 *                      A(IA:IA+K-1,JA:JA+N-1)  otherwise.
59 *
60 *  Alpha  and   beta  are  scalars,  sub( C )  is  an  n by n  symmetric
61 *  submatrix and sub( A ) is an n by k submatrix in the first case and a
62 *  k by n submatrix in the second case.
63 *
64 *  Notes
65 *  =====
66 *
67 *  A description  vector  is associated with each 2D block-cyclicly dis-
68 *  tributed matrix.  This  vector  stores  the  information  required to
69 *  establish the  mapping  between a  matrix entry and its corresponding
70 *  process and memory location.
71 *
72 *  In  the  following  comments,   the character _  should  be  read  as
73 *  "of  the  distributed  matrix".  Let  A  be a generic term for any 2D
74 *  block cyclicly distributed matrix.  Its description vector is DESC_A:
75 *
76 *  NOTATION         STORED IN       EXPLANATION
77 *  ---------------- --------------- ------------------------------------
78 *  DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
79 *  CTXT_A  (global) DESCA[ CTXT_  ] The BLACS context handle, indicating
80 *                                   the NPROW x NPCOL BLACS process grid
81 *                                   A  is  distributed over. The context
82 *                                   itself  is  global,  but  the handle
83 *                                   (the integer value) may vary.
84 *  M_A     (global) DESCA[ M_     ] The  number of rows in the distribu-
85 *                                   ted matrix A, M_A >= 0.
86 *  N_A     (global) DESCA[ N_     ] The number of columns in the distri-
87 *                                   buted matrix A, N_A >= 0.
88 *  IMB_A   (global) DESCA[ IMB_   ] The number of rows of the upper left
89 *                                   block of the matrix A, IMB_A > 0.
90 *  INB_A   (global) DESCA[ INB_   ] The  number  of columns of the upper
91 *                                   left   block   of   the  matrix   A,
92 *                                   INB_A > 0.
93 *  MB_A    (global) DESCA[ MB_    ] The blocking factor used to  distri-
94 *                                   bute the last  M_A-IMB_A  rows of A,
95 *                                   MB_A > 0.
96 *  NB_A    (global) DESCA[ NB_    ] The blocking factor used to  distri-
97 *                                   bute the last  N_A-INB_A  columns of
98 *                                   A, NB_A > 0.
99 *  RSRC_A  (global) DESCA[ RSRC_  ] The process row over which the first
100 *                                   row of the matrix  A is distributed,
101 *                                   NPROW > RSRC_A >= 0.
102 *  CSRC_A  (global) DESCA[ CSRC_  ] The  process column  over  which the
103 *                                   first column of  A  is  distributed.
104 *                                   NPCOL > CSRC_A >= 0.
105 *  LLD_A   (local)  DESCA[ LLD_   ] The  leading dimension  of the local
106 *                                   array  storing  the  local blocks of
107 *                                   the distributed matrix A,
108 *                                   IF( Lc( 1, N_A ) > 0 )
109 *                                      LLD_A >= MAX( 1, Lr( 1, M_A ) )
110 *                                   ELSE
111 *                                      LLD_A >= 1.
112 *
113 *  Let K be the number of  rows of a matrix A starting at the global in-
114 *  dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
115 *  that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
116 *  receive if these K rows were distributed over NPROW processes.  If  K
117 *  is the number of columns of a matrix  A  starting at the global index
118 *  JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number  of co-
119 *  lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would  receive if
120 *  these K columns were distributed over NPCOL processes.
121 *
122 *  The values of Lr() and Lc() may be determined via a call to the func-
123 *  tion PB_Cnumroc:
124 *  Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
125 *  Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
126 *
127 *  Arguments
128 *  =========
129 *
130 *  UPLO    (global input) CHARACTER*1
131 *          On  entry,   UPLO  specifies  whether  the  local  pieces  of
132 *          the array  C  containing the  upper or lower triangular  part
133 *          of the symmetric submatrix  sub( C )  are to be referenced as
134 *          follows:
135 *
136 *             UPLO = 'U' or 'u'   Only the local pieces corresponding to
137 *                                 the   upper  triangular  part  of  the
138 *                                 symmetric submatrix sub( C ) are to be
139 *                                 referenced,
140 *
141 *             UPLO = 'L' or 'l'   Only the local pieces corresponding to
142 *                                 the   lower  triangular  part  of  the
143 *                                 symmetric submatrix sub( C ) are to be
144 *                                 referenced.
145 *
146 *  TRANS   (global input) CHARACTER*1
147 *          On entry,  TRANS  specifies the  operation to be performed as
148 *          follows:
149 *
150 *             TRANS = 'N' or 'n'
151 *                  sub( C ) := alpha*sub( A )*sub( A )' + beta*sub( C ),
152 *
153 *             TRANS = 'T' or 't'
154 *                  sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ).
155 *
156 *             TRANS = 'C' or 'c'
157 *                  sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ).
158 *
159 *  N       (global input) INTEGER
160 *          On entry,  N specifies the order of the  submatrix  sub( C ).
161 *          N must be at least zero.
162 *
163 *  K       (global input) INTEGER
164 *          On entry, with TRANS = 'N' or 'n',  K specifies the number of
165 *          columns  of  the submatrix  sub( A ), and with TRANS = 'T' or
166 *          't' or 'C' or 'c', K specifies the number of rows of the sub-
167 *          matrix sub( A ). K  must  be at least zero.
168 *
169 *  ALPHA   (global input) REAL
170 *          On entry, ALPHA specifies the scalar alpha.   When  ALPHA  is
171 *          supplied  as  zero  then  the  local entries of  the array  A
172 *          corresponding to the entries of the submatrix  sub( A )  need
173 *          not be set on input.
174 *
175 *  A       (local input) REAL array
176 *          On entry, A is an array of dimension (LLD_A, Ka), where Ka is
177 *          at least  Lc( 1, JA+K-1 ) when  TRANS = 'N' or 'n', and is at
178 *          least Lc( 1, JA+N-1 ) otherwise.  Before  entry,  this  array
179 *          contains the local entries of the matrix A.
180 *          Before entry with TRANS = 'N' or 'n', this array contains the
181 *          local entries corresponding to the entries of the n by k sub-
182 *          matrix sub( A ), otherwise the local entries corresponding to
183 *          the entries of the k by n submatrix sub( A ).
184 *
185 *  IA      (global input) INTEGER
186 *          On entry, IA  specifies A's global row index, which points to
187 *          the beginning of the submatrix sub( A ).
188 *
189 *  JA      (global input) INTEGER
190 *          On entry, JA  specifies A's global column index, which points
191 *          to the beginning of the submatrix sub( A ).
192 *
193 *  DESCA   (global and local input) INTEGER array
194 *          On entry, DESCA  is an integer array of dimension DLEN_. This
195 *          is the array descriptor for the matrix A.
196 *
197 *  BETA    (global input) REAL
198 *          On entry,  BETA  specifies the scalar  beta.   When  BETA  is
199 *          supplied  as  zero  then  the  local entries of  the array  C
200 *          corresponding to the entries of the submatrix  sub( C )  need
201 *          not be set on input.
202 *
203 *  C       (local input/local output) REAL array
204 *          On entry, C is an array of dimension (LLD_C, Kc), where Kc is
205 *          at least Lc( 1, JC+N-1 ).  Before  entry, this array contains
206 *          the local entries of the matrix C.
207 *          Before  entry  with  UPLO = 'U' or 'u', this  array  contains
208 *          the local entries corresponding to the upper triangular  part
209 *          of the  symmetric  submatrix  sub( C ), and the local entries
210 *          corresponding to the  strictly lower triangular  of  sub( C )
211 *          are not  referenced.  On exit,  the upper triangular part  of
212 *          sub( C ) is overwritten by the  upper triangular part  of the
213 *          updated submatrix.
214 *          Before  entry  with  UPLO = 'L' or 'l', this  array  contains
215 *          the local entries corresponding to the lower triangular  part
216 *          of the  symmetric  submatrix  sub( C ), and the local entries
217 *          corresponding to the  strictly upper triangular  of  sub( C )
218 *          are not  referenced.  On exit,  the lower triangular part  of
219 *          sub( C ) is overwritten by the  lower triangular part  of the
220 *          updated submatrix.
221 *
222 *  IC      (global input) INTEGER
223 *          On entry, IC  specifies C's global row index, which points to
224 *          the beginning of the submatrix sub( C ).
225 *
226 *  JC      (global input) INTEGER
227 *          On entry, JC  specifies C's global column index, which points
228 *          to the beginning of the submatrix sub( C ).
229 *
230 *  DESCC   (global and local input) INTEGER array
231 *          On entry, DESCC  is an integer array of dimension DLEN_. This
232 *          is the array descriptor for the matrix C.
233 *
234 *  -- Written on April 1, 1998 by
235 *     Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
236 *
237 *  ---------------------------------------------------------------------
238 */
239 /*
240 *  .. Local Scalars ..
241 */
242    char           DirA, OpC, OpR, TopC, TopR, TranOp, UploC, ctop, ctopsave,
243                   rtop, rtopsave;
244    int            Ai, Aj, ChooseAC, Ci, Cj, ForceTop, ctxt, info, mycol,
245                   myrow, nb, notran, npcol, nprow, upper;
246    double         Aest, ACest, tmp1, tmp2, tmp3, tmp4;
247    PBTYP_T        * type;
248 /*
249 *  .. Local Arrays ..
250 */
251    int            Ad[DLEN_], Cd[DLEN_];
252 /* ..
253 *  .. Executable Statements ..
254 *
255 */
256    upper  = ( ( UploC  = Mupcase( F2C_CHAR( UPLO  )[0] ) ) ==  CUPPER );
257    notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN );
258    PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad );
259    PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd );
260 #ifndef NO_ARGCHK
261 /*
262 *  Test the input parameters
263 */
264    Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
265    if( !( info = ( ( nprow == -1 ) ? -( 901 + CTXT_ ) : 0 ) ) )
266    {
267       if( ( !upper ) && ( UploC != CLOWER ) )
268       {
269          PB_Cwarn( ctxt, __LINE__, "PSSYRK", "Illegal UPLO = %c\n", UploC );
270          info = -1;
271       }
272       else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) )
273       {
274          PB_Cwarn( ctxt, __LINE__, "PSSYRK", "Illegal TRANS = %c\n", TranOp );
275          info = -2;
276       }
277       if( notran )
278          PB_Cchkmat( ctxt, "PSSYRK", "A", *N, 3, *K, 4, Ai, Aj, Ad,  9,
279                      &info );
280       else
281          PB_Cchkmat( ctxt, "PSSYRK", "A", *K, 4, *N, 3, Ai, Aj, Ad,  9,
282                      &info );
283       PB_Cchkmat(    ctxt, "PSSYRK", "C", *N, 3, *N, 3, Ci, Cj, Cd, 14,
284                      &info );
285    }
286    if( info ) { PB_Cabort( ctxt, "PSSYRK", info ); return; }
287 #endif
288 /*
289 *  Quick return if possible
290 */
291    if( ( *N == 0 ) ||
292        ( ( ( ALPHA[REAL_PART] == ZERO ) || ( *K == 0 ) ) &&
293          ( BETA[REAL_PART] == ONE                      ) ) )
294       return;
295 /*
296 *  Get type structure
297 */
298    type = PB_Cstypeset();
299 /*
300 *  And when alpha or K is zero
301 */
302    if( ( ALPHA[REAL_PART] == ZERO ) || ( *K == 0 ) )
303    {
304       if( BETA[REAL_PART] == ZERO )
305       {
306          PB_Cplapad( type, &UploC, NOCONJG, *N, *N, type->zero, type->zero,
307                      ((char *) C), Ci, Cj, Cd );
308       }
309       else
310       {
311          PB_Cplascal( type, &UploC, NOCONJG, *N, *N, ((char *) BETA),
312                       ((char *) C), Ci, Cj, Cd );
313       }
314       return;
315    }
316 /*
317 *  Start the operations
318 */
319 #ifdef NO_ARGCHK
320    Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
321 #endif
322 /*
323 *  Algorithm selection is based on approximation of the communication volume
324 *  for distributed and aligned operands.
325 *
326 *  ACest: both operands sub( A ) and sub( C ) are communicated (K >> N)
327 *  Aest : only sub( A ) is communicated                        (N >> K)
328 */
329    if( notran )
330    {
331       tmp1  = DNROC( *N, Cd[MB_], nprow ); tmp3 = DNROC( *K, Ad[NB_], npcol );
332       ACest = (double)(*N) *
333         ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp3 ) +
334           ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO :
335             CBRATIO * tmp1 / TWO ) );
336       tmp1  = DNROC( *N, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol );
337                                            tmp4 = DNROC( *N, Ad[MB_], nprow );
338       Aest  = (double)(*K) *
339               ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) +
340                 ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ) );
341    }
342    else
343    {
344       tmp2  = DNROC( *N, Cd[NB_], npcol ); tmp4 = DNROC( *K, Ad[MB_], nprow );
345       ACest = (double)(*N) *
346         ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp4 ) +
347           ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO :
348             CBRATIO * tmp2 / TWO ) );
349       tmp1  = DNROC( *N, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol );
350       tmp3  = DNROC( *N, Ad[NB_], npcol );
351       Aest  = (double)(*K) *
352               ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ) +
353                 ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) );
354    }
355 /*
356 *  Shift a little the cross-over point between both algorithms.
357 */
358    ChooseAC = ( ( 1.3 * ACest ) <= Aest );
359 /*
360 *  BLACS topologies are enforced iff N and K are strictly greater than the
361 *  logical block size returned by pilaenv_. Otherwise, it is assumed that the
362 *  routine calling this routine has already selected an adequate topology.
363 */
364    nb       = pilaenv_( &ctxt, C2F_CHAR( &type->type ) );
365    ForceTop = ( ( *N > nb ) && ( *K > nb ) );
366 
367    if( ChooseAC )
368    {
369       if( notran )
370       {
371          OpC  = CBCAST;
372          ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET );
373 
374          if( ForceTop )
375          {
376             OpR  = CCOMBINE;
377             rtop = *PB_Ctop( &ctxt, &OpR, ROW,    TOP_GET );
378 
379             rtopsave = rtop;
380             ctopsave = ctop;
381 
382             if( upper ) { TopR = CTOP_IRING; TopC = CTOP_DRING; }
383             else        { TopR = CTOP_DRING; TopC = CTOP_IRING; }
384 
385             ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC );
386             rtop = *PB_Ctop( &ctxt, &OpR, ROW,    &TopR );
387 /*
388 *  Remove the next line when the BLACS combine operations support ring
389 *  topologies
390 */
391             rtop = *PB_Ctop( &ctxt, &OpR, ROW,    TOP_DEFAULT );
392          }
393 
394          DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD );
395       }
396       else
397       {
398          OpR  = CBCAST;
399          rtop = *PB_Ctop( &ctxt, &OpR, ROW,    TOP_GET );
400 
401          if( ForceTop )
402          {
403             OpC  = CCOMBINE;
404             ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET );
405 
406             rtopsave = rtop;
407             ctopsave = ctop;
408 
409             if( upper ) { TopR = CTOP_IRING; TopC = CTOP_DRING; }
410             else        { TopR = CTOP_DRING; TopC = CTOP_IRING; }
411 
412             rtop = *PB_Ctop( &ctxt, &OpR, ROW,    &TopR );
413             ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC );
414 /*
415 *  Remove the next line when the BLACS combine operations support ring
416 *  topologies
417 */
418             ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT );
419          }
420          DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD );
421       }
422 
423       PB_CpsyrkAC( type, &DirA, NOCONJG, &UploC, ( notran ? NOTRAN : TRAN ), *N,
424                    *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)BETA),
425                    ((char *)C), Ci, Cj, Cd );
426    }
427    else
428    {
429       if( notran )
430       {
431          OpR  = CBCAST;
432          rtop = *PB_Ctop( &ctxt, &OpR, ROW,    TOP_GET );
433 
434          if( ForceTop )
435          {
436             OpC  = CBCAST;
437             ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET );
438 
439             rtopsave = rtop;
440             ctopsave = ctop;
441 /*
442 *  No clear winner for the ring topologies, so that if a ring topology is
443 *  already selected, keep it.
444 */
445             if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) &&
446                 ( rtop != CTOP_SRING ) )
447                rtop = *PB_Ctop( &ctxt, &OpR, ROW,    TOP_SRING );
448             if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) &&
449                 ( ctop != CTOP_SRING ) )
450                ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING );
451          }
452 
453          DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD );
454       }
455       else
456       {
457          OpC  = CBCAST;
458          ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET );
459 
460          if( ForceTop )
461          {
462             OpR  = CBCAST;
463             rtop = *PB_Ctop( &ctxt, &OpR, ROW,    TOP_GET );
464 
465             rtopsave = rtop;
466             ctopsave = ctop;
467 /*
468 *  No clear winner for the ring topologies, so that if a ring topology is
469 *  already selected, keep it.
470 */
471             if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) &&
472                 ( rtop != CTOP_SRING ) )
473                rtop = *PB_Ctop( &ctxt, &OpR, ROW,    TOP_SRING );
474             if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) &&
475                 ( ctop != CTOP_SRING ) )
476                ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING );
477          }
478 
479          DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD );
480       }
481 
482       PB_CpsyrkA( type, &DirA, NOCONJG, &UploC, ( notran ? NOTRAN : TRAN ),  *N,
483                   *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)BETA),
484                   ((char *)C), Ci, Cj, Cd );
485    }
486 /*
487 *  Restore the BLACS topologies when necessary.
488 */
489    if( ForceTop )
490    {
491       rtopsave = *PB_Ctop( &ctxt, &OpR, ROW,    &rtopsave );
492       ctopsave = *PB_Ctop( &ctxt, &OpC, COLUMN, &ctopsave );
493    }
494 /*
495 *  End of PSSYRK
496 */
497 }
498