1 /* ---------------------------------------------------------------------
2 *
3 * -- PBLAS routine (version 2.0) --
4 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5 * and University of California, Berkeley.
6 * April 1, 1998
7 *
8 * ---------------------------------------------------------------------
9 */
10 /*
11 * Include files
12 */
13 #include "pblas.h"
14 #include "PBpblas.h"
15 #include "PBtools.h"
16 #include "PBblacs.h"
17 #include "PBblas.h"
18
19 #ifdef __STDC__
pssyrk_(F_CHAR_T UPLO,F_CHAR_T TRANS,int * N,int * K,float * ALPHA,float * A,int * IA,int * JA,int * DESCA,float * BETA,float * C,int * IC,int * JC,int * DESCC)20 void pssyrk_( F_CHAR_T UPLO, F_CHAR_T TRANS, int * N, int * K,
21 float * ALPHA,
22 float * A, int * IA, int * JA, int * DESCA,
23 float * BETA,
24 float * C, int * IC, int * JC, int * DESCC )
25 #else
26 void pssyrk_( UPLO, TRANS, N, K, ALPHA, A, IA, JA, DESCA, BETA,
27 C, IC, JC, DESCC )
28 /*
29 * .. Scalar Arguments ..
30 */
31 F_CHAR_T TRANS, UPLO;
32 int * IA, * IC, * JA, * JC, * K, * N;
33 float * ALPHA, * BETA;
34 /*
35 * .. Array Arguments ..
36 */
37 int * DESCA, * DESCC;
38 float * A, * C;
39 #endif
40 {
41 /*
42 * Purpose
43 * =======
44 *
45 * PSSYRK performs one of the symmetric rank k operations
46 *
47 * sub( C ) := alpha*sub( A )*sub( A )' + beta*sub( C ),
48 *
49 * or
50 *
51 * sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ),
52 *
53 * where
54 *
55 * sub( C ) denotes C(IC:IC+N-1,JC:JC+N-1), and,
56 *
57 * sub( A ) denotes A(IA:IA+N-1,JA:JA+K-1) if TRANS = 'N',
58 * A(IA:IA+K-1,JA:JA+N-1) otherwise.
59 *
60 * Alpha and beta are scalars, sub( C ) is an n by n symmetric
61 * submatrix and sub( A ) is an n by k submatrix in the first case and a
62 * k by n submatrix in the second case.
63 *
64 * Notes
65 * =====
66 *
67 * A description vector is associated with each 2D block-cyclicly dis-
68 * tributed matrix. This vector stores the information required to
69 * establish the mapping between a matrix entry and its corresponding
70 * process and memory location.
71 *
72 * In the following comments, the character _ should be read as
73 * "of the distributed matrix". Let A be a generic term for any 2D
74 * block cyclicly distributed matrix. Its description vector is DESC_A:
75 *
76 * NOTATION STORED IN EXPLANATION
77 * ---------------- --------------- ------------------------------------
78 * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
79 * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
80 * the NPROW x NPCOL BLACS process grid
81 * A is distributed over. The context
82 * itself is global, but the handle
83 * (the integer value) may vary.
84 * M_A (global) DESCA[ M_ ] The number of rows in the distribu-
85 * ted matrix A, M_A >= 0.
86 * N_A (global) DESCA[ N_ ] The number of columns in the distri-
87 * buted matrix A, N_A >= 0.
88 * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
89 * block of the matrix A, IMB_A > 0.
90 * INB_A (global) DESCA[ INB_ ] The number of columns of the upper
91 * left block of the matrix A,
92 * INB_A > 0.
93 * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
94 * bute the last M_A-IMB_A rows of A,
95 * MB_A > 0.
96 * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
97 * bute the last N_A-INB_A columns of
98 * A, NB_A > 0.
99 * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
100 * row of the matrix A is distributed,
101 * NPROW > RSRC_A >= 0.
102 * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
103 * first column of A is distributed.
104 * NPCOL > CSRC_A >= 0.
105 * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
106 * array storing the local blocks of
107 * the distributed matrix A,
108 * IF( Lc( 1, N_A ) > 0 )
109 * LLD_A >= MAX( 1, Lr( 1, M_A ) )
110 * ELSE
111 * LLD_A >= 1.
112 *
113 * Let K be the number of rows of a matrix A starting at the global in-
114 * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
115 * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
116 * receive if these K rows were distributed over NPROW processes. If K
117 * is the number of columns of a matrix A starting at the global index
118 * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
119 * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
120 * these K columns were distributed over NPCOL processes.
121 *
122 * The values of Lr() and Lc() may be determined via a call to the func-
123 * tion PB_Cnumroc:
124 * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
125 * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
126 *
127 * Arguments
128 * =========
129 *
130 * UPLO (global input) CHARACTER*1
131 * On entry, UPLO specifies whether the local pieces of
132 * the array C containing the upper or lower triangular part
133 * of the symmetric submatrix sub( C ) are to be referenced as
134 * follows:
135 *
136 * UPLO = 'U' or 'u' Only the local pieces corresponding to
137 * the upper triangular part of the
138 * symmetric submatrix sub( C ) are to be
139 * referenced,
140 *
141 * UPLO = 'L' or 'l' Only the local pieces corresponding to
142 * the lower triangular part of the
143 * symmetric submatrix sub( C ) are to be
144 * referenced.
145 *
146 * TRANS (global input) CHARACTER*1
147 * On entry, TRANS specifies the operation to be performed as
148 * follows:
149 *
150 * TRANS = 'N' or 'n'
151 * sub( C ) := alpha*sub( A )*sub( A )' + beta*sub( C ),
152 *
153 * TRANS = 'T' or 't'
154 * sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ).
155 *
156 * TRANS = 'C' or 'c'
157 * sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ).
158 *
159 * N (global input) INTEGER
160 * On entry, N specifies the order of the submatrix sub( C ).
161 * N must be at least zero.
162 *
163 * K (global input) INTEGER
164 * On entry, with TRANS = 'N' or 'n', K specifies the number of
165 * columns of the submatrix sub( A ), and with TRANS = 'T' or
166 * 't' or 'C' or 'c', K specifies the number of rows of the sub-
167 * matrix sub( A ). K must be at least zero.
168 *
169 * ALPHA (global input) REAL
170 * On entry, ALPHA specifies the scalar alpha. When ALPHA is
171 * supplied as zero then the local entries of the array A
172 * corresponding to the entries of the submatrix sub( A ) need
173 * not be set on input.
174 *
175 * A (local input) REAL array
176 * On entry, A is an array of dimension (LLD_A, Ka), where Ka is
177 * at least Lc( 1, JA+K-1 ) when TRANS = 'N' or 'n', and is at
178 * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array
179 * contains the local entries of the matrix A.
180 * Before entry with TRANS = 'N' or 'n', this array contains the
181 * local entries corresponding to the entries of the n by k sub-
182 * matrix sub( A ), otherwise the local entries corresponding to
183 * the entries of the k by n submatrix sub( A ).
184 *
185 * IA (global input) INTEGER
186 * On entry, IA specifies A's global row index, which points to
187 * the beginning of the submatrix sub( A ).
188 *
189 * JA (global input) INTEGER
190 * On entry, JA specifies A's global column index, which points
191 * to the beginning of the submatrix sub( A ).
192 *
193 * DESCA (global and local input) INTEGER array
194 * On entry, DESCA is an integer array of dimension DLEN_. This
195 * is the array descriptor for the matrix A.
196 *
197 * BETA (global input) REAL
198 * On entry, BETA specifies the scalar beta. When BETA is
199 * supplied as zero then the local entries of the array C
200 * corresponding to the entries of the submatrix sub( C ) need
201 * not be set on input.
202 *
203 * C (local input/local output) REAL array
204 * On entry, C is an array of dimension (LLD_C, Kc), where Kc is
205 * at least Lc( 1, JC+N-1 ). Before entry, this array contains
206 * the local entries of the matrix C.
207 * Before entry with UPLO = 'U' or 'u', this array contains
208 * the local entries corresponding to the upper triangular part
209 * of the symmetric submatrix sub( C ), and the local entries
210 * corresponding to the strictly lower triangular of sub( C )
211 * are not referenced. On exit, the upper triangular part of
212 * sub( C ) is overwritten by the upper triangular part of the
213 * updated submatrix.
214 * Before entry with UPLO = 'L' or 'l', this array contains
215 * the local entries corresponding to the lower triangular part
216 * of the symmetric submatrix sub( C ), and the local entries
217 * corresponding to the strictly upper triangular of sub( C )
218 * are not referenced. On exit, the lower triangular part of
219 * sub( C ) is overwritten by the lower triangular part of the
220 * updated submatrix.
221 *
222 * IC (global input) INTEGER
223 * On entry, IC specifies C's global row index, which points to
224 * the beginning of the submatrix sub( C ).
225 *
226 * JC (global input) INTEGER
227 * On entry, JC specifies C's global column index, which points
228 * to the beginning of the submatrix sub( C ).
229 *
230 * DESCC (global and local input) INTEGER array
231 * On entry, DESCC is an integer array of dimension DLEN_. This
232 * is the array descriptor for the matrix C.
233 *
234 * -- Written on April 1, 1998 by
235 * Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
236 *
237 * ---------------------------------------------------------------------
238 */
239 /*
240 * .. Local Scalars ..
241 */
242 char DirA, OpC, OpR, TopC, TopR, TranOp, UploC, ctop, ctopsave,
243 rtop, rtopsave;
244 int Ai, Aj, ChooseAC, Ci, Cj, ForceTop, ctxt, info, mycol,
245 myrow, nb, notran, npcol, nprow, upper;
246 double Aest, ACest, tmp1, tmp2, tmp3, tmp4;
247 PBTYP_T * type;
248 /*
249 * .. Local Arrays ..
250 */
251 int Ad[DLEN_], Cd[DLEN_];
252 /* ..
253 * .. Executable Statements ..
254 *
255 */
256 upper = ( ( UploC = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER );
257 notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN );
258 PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad );
259 PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd );
260 #ifndef NO_ARGCHK
261 /*
262 * Test the input parameters
263 */
264 Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
265 if( !( info = ( ( nprow == -1 ) ? -( 901 + CTXT_ ) : 0 ) ) )
266 {
267 if( ( !upper ) && ( UploC != CLOWER ) )
268 {
269 PB_Cwarn( ctxt, __LINE__, "PSSYRK", "Illegal UPLO = %c\n", UploC );
270 info = -1;
271 }
272 else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) )
273 {
274 PB_Cwarn( ctxt, __LINE__, "PSSYRK", "Illegal TRANS = %c\n", TranOp );
275 info = -2;
276 }
277 if( notran )
278 PB_Cchkmat( ctxt, "PSSYRK", "A", *N, 3, *K, 4, Ai, Aj, Ad, 9,
279 &info );
280 else
281 PB_Cchkmat( ctxt, "PSSYRK", "A", *K, 4, *N, 3, Ai, Aj, Ad, 9,
282 &info );
283 PB_Cchkmat( ctxt, "PSSYRK", "C", *N, 3, *N, 3, Ci, Cj, Cd, 14,
284 &info );
285 }
286 if( info ) { PB_Cabort( ctxt, "PSSYRK", info ); return; }
287 #endif
288 /*
289 * Quick return if possible
290 */
291 if( ( *N == 0 ) ||
292 ( ( ( ALPHA[REAL_PART] == ZERO ) || ( *K == 0 ) ) &&
293 ( BETA[REAL_PART] == ONE ) ) )
294 return;
295 /*
296 * Get type structure
297 */
298 type = PB_Cstypeset();
299 /*
300 * And when alpha or K is zero
301 */
302 if( ( ALPHA[REAL_PART] == ZERO ) || ( *K == 0 ) )
303 {
304 if( BETA[REAL_PART] == ZERO )
305 {
306 PB_Cplapad( type, &UploC, NOCONJG, *N, *N, type->zero, type->zero,
307 ((char *) C), Ci, Cj, Cd );
308 }
309 else
310 {
311 PB_Cplascal( type, &UploC, NOCONJG, *N, *N, ((char *) BETA),
312 ((char *) C), Ci, Cj, Cd );
313 }
314 return;
315 }
316 /*
317 * Start the operations
318 */
319 #ifdef NO_ARGCHK
320 Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
321 #endif
322 /*
323 * Algorithm selection is based on approximation of the communication volume
324 * for distributed and aligned operands.
325 *
326 * ACest: both operands sub( A ) and sub( C ) are communicated (K >> N)
327 * Aest : only sub( A ) is communicated (N >> K)
328 */
329 if( notran )
330 {
331 tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp3 = DNROC( *K, Ad[NB_], npcol );
332 ACest = (double)(*N) *
333 ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp3 ) +
334 ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO :
335 CBRATIO * tmp1 / TWO ) );
336 tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol );
337 tmp4 = DNROC( *N, Ad[MB_], nprow );
338 Aest = (double)(*K) *
339 ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) +
340 ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ) );
341 }
342 else
343 {
344 tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp4 = DNROC( *K, Ad[MB_], nprow );
345 ACest = (double)(*N) *
346 ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp4 ) +
347 ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO :
348 CBRATIO * tmp2 / TWO ) );
349 tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol );
350 tmp3 = DNROC( *N, Ad[NB_], npcol );
351 Aest = (double)(*K) *
352 ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ) +
353 ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) );
354 }
355 /*
356 * Shift a little the cross-over point between both algorithms.
357 */
358 ChooseAC = ( ( 1.3 * ACest ) <= Aest );
359 /*
360 * BLACS topologies are enforced iff N and K are strictly greater than the
361 * logical block size returned by pilaenv_. Otherwise, it is assumed that the
362 * routine calling this routine has already selected an adequate topology.
363 */
364 nb = pilaenv_( &ctxt, C2F_CHAR( &type->type ) );
365 ForceTop = ( ( *N > nb ) && ( *K > nb ) );
366
367 if( ChooseAC )
368 {
369 if( notran )
370 {
371 OpC = CBCAST;
372 ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET );
373
374 if( ForceTop )
375 {
376 OpR = CCOMBINE;
377 rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET );
378
379 rtopsave = rtop;
380 ctopsave = ctop;
381
382 if( upper ) { TopR = CTOP_IRING; TopC = CTOP_DRING; }
383 else { TopR = CTOP_DRING; TopC = CTOP_IRING; }
384
385 ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC );
386 rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR );
387 /*
388 * Remove the next line when the BLACS combine operations support ring
389 * topologies
390 */
391 rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT );
392 }
393
394 DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD );
395 }
396 else
397 {
398 OpR = CBCAST;
399 rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET );
400
401 if( ForceTop )
402 {
403 OpC = CCOMBINE;
404 ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET );
405
406 rtopsave = rtop;
407 ctopsave = ctop;
408
409 if( upper ) { TopR = CTOP_IRING; TopC = CTOP_DRING; }
410 else { TopR = CTOP_DRING; TopC = CTOP_IRING; }
411
412 rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR );
413 ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC );
414 /*
415 * Remove the next line when the BLACS combine operations support ring
416 * topologies
417 */
418 ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT );
419 }
420 DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD );
421 }
422
423 PB_CpsyrkAC( type, &DirA, NOCONJG, &UploC, ( notran ? NOTRAN : TRAN ), *N,
424 *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)BETA),
425 ((char *)C), Ci, Cj, Cd );
426 }
427 else
428 {
429 if( notran )
430 {
431 OpR = CBCAST;
432 rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET );
433
434 if( ForceTop )
435 {
436 OpC = CBCAST;
437 ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET );
438
439 rtopsave = rtop;
440 ctopsave = ctop;
441 /*
442 * No clear winner for the ring topologies, so that if a ring topology is
443 * already selected, keep it.
444 */
445 if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) &&
446 ( rtop != CTOP_SRING ) )
447 rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_SRING );
448 if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) &&
449 ( ctop != CTOP_SRING ) )
450 ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING );
451 }
452
453 DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD );
454 }
455 else
456 {
457 OpC = CBCAST;
458 ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET );
459
460 if( ForceTop )
461 {
462 OpR = CBCAST;
463 rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET );
464
465 rtopsave = rtop;
466 ctopsave = ctop;
467 /*
468 * No clear winner for the ring topologies, so that if a ring topology is
469 * already selected, keep it.
470 */
471 if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) &&
472 ( rtop != CTOP_SRING ) )
473 rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_SRING );
474 if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) &&
475 ( ctop != CTOP_SRING ) )
476 ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING );
477 }
478
479 DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD );
480 }
481
482 PB_CpsyrkA( type, &DirA, NOCONJG, &UploC, ( notran ? NOTRAN : TRAN ), *N,
483 *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)BETA),
484 ((char *)C), Ci, Cj, Cd );
485 }
486 /*
487 * Restore the BLACS topologies when necessary.
488 */
489 if( ForceTop )
490 {
491 rtopsave = *PB_Ctop( &ctxt, &OpR, ROW, &rtopsave );
492 ctopsave = *PB_Ctop( &ctxt, &OpC, COLUMN, &ctopsave );
493 }
494 /*
495 * End of PSSYRK
496 */
497 }
498