1 /* ---------------------------------------------------------------------
2 *
3 * -- PBLAS auxiliary routine (version 2.0) --
4 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5 * and University of California, Berkeley.
6 * April 1, 1998
7 *
8 * ---------------------------------------------------------------------
9 */
10 /*
11 * Include files
12 */
13 #include "../pblas.h"
14 #include "../PBpblas.h"
15 #include "../PBtools.h"
16 #include "../PBblacs.h"
17 #include "../PBblas.h"
18
19 #ifdef __STDC__
PB_Cplascal(PBTYP_T * TYPE,char * UPLO,char * CONJUG,int M,int N,char * ALPHA,char * A,int IA,int JA,int * DESCA)20 void PB_Cplascal( PBTYP_T * TYPE, char * UPLO, char * CONJUG, int M,
21 int N, char * ALPHA, char * A, int IA, int JA,
22 int * DESCA )
23 #else
24 void PB_Cplascal( TYPE, UPLO, CONJUG, M, N, ALPHA, A, IA, JA, DESCA )
25 /*
26 * .. Scalar Arguments ..
27 */
28 char * CONJUG, * UPLO;
29 int IA, JA, M, N;
30 char * ALPHA;
31 PBTYP_T * TYPE;
32 /*
33 * .. Array Arguments ..
34 */
35 int * DESCA;
36 char * A;
37 #endif
38 {
39 /*
40 * Purpose
41 * =======
42 *
43 * PB_Cplascal scales by alpha an m by n submatrix sub( A ) denoting
44 * A(IA:IA+M-1,JA:JA+N-1).
45 *
46 * Notes
47 * =====
48 *
49 * A description vector is associated with each 2D block-cyclicly dis-
50 * tributed matrix. This vector stores the information required to
51 * establish the mapping between a matrix entry and its corresponding
52 * process and memory location.
53 *
54 * In the following comments, the character _ should be read as
55 * "of the distributed matrix". Let A be a generic term for any 2D
56 * block cyclicly distributed matrix. Its description vector is DESC_A:
57 *
58 * NOTATION STORED IN EXPLANATION
59 * ---------------- --------------- ------------------------------------
60 * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
61 * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
62 * the NPROW x NPCOL BLACS process grid
63 * A is distributed over. The context
64 * itself is global, but the handle
65 * (the integer value) may vary.
66 * M_A (global) DESCA[ M_ ] The number of rows in the distribu-
67 * ted matrix A, M_A >= 0.
68 * N_A (global) DESCA[ N_ ] The number of columns in the distri-
69 * buted matrix A, N_A >= 0.
70 * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
71 * block of the matrix A, IMB_A > 0.
72 * INB_A (global) DESCA[ INB_ ] The number of columns of the upper
73 * left block of the matrix A,
74 * INB_A > 0.
75 * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
76 * bute the last M_A-IMB_A rows of A,
77 * MB_A > 0.
78 * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
79 * bute the last N_A-INB_A columns of
80 * A, NB_A > 0.
81 * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
82 * row of the matrix A is distributed,
83 * NPROW > RSRC_A >= 0.
84 * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
85 * first column of A is distributed.
86 * NPCOL > CSRC_A >= 0.
87 * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
88 * array storing the local blocks of
89 * the distributed matrix A,
90 * IF( Lc( 1, N_A ) > 0 )
91 * LLD_A >= MAX( 1, Lr( 1, M_A ) )
92 * ELSE
93 * LLD_A >= 1.
94 *
95 * Let K be the number of rows of a matrix A starting at the global in-
96 * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
97 * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
98 * receive if these K rows were distributed over NPROW processes. If K
99 * is the number of columns of a matrix A starting at the global index
100 * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
101 * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
102 * these K columns were distributed over NPCOL processes.
103 *
104 * The values of Lr() and Lc() may be determined via a call to the func-
105 * tion PB_Cnumroc:
106 * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
107 * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
108 *
109 * Arguments
110 * =========
111 *
112 * TYPE (local input) pointer to a PBTYP_T structure
113 * On entry, TYPE is a pointer to a structure of type PBTYP_T,
114 * that contains type information (See pblas.h).
115 *
116 * UPLO (global input) pointer to CHAR
117 * On entry, UPLO specifies the part of the submatrix sub( A )
118 * to be scaled as follows:
119 * = 'L' or 'l': Lower triangular part is scaled; the
120 * strictly upper triangular part of sub( A ) is not changed;
121 * = 'U' or 'u': Upper triangular part is scaled; the
122 * strictly lower triangular part of sub( A ) is not changed;
123 * Otherwise: All of the submatrix sub( A ) is scaled.
124 *
125 * CONJUG (global input) pointer to CHAR
126 * On entry, CONJUG specifies what kind of scaling should be
127 * done as follows: when UPLO is 'L', 'l', 'U' or 'u' and CONJUG
128 * is 'Z' or 'z', alpha is assumed to be real and the imaginary
129 * part of the diagonals are set to zero. Otherwise, alpha is of
130 * the same type as the entries of sub( A ) and nothing particu-
131 * lar is done to the diagonals of sub( A ).
132 *
133 * M (global input) INTEGER
134 * On entry, M specifies the number of rows of the submatrix
135 * sub( A ). M must be at least zero.
136 *
137 * N (global input) INTEGER
138 * On entry, N specifies the number of columns of the submatrix
139 * sub( A ). N must be at least zero.
140 *
141 * ALPHA (global input) pointer to CHAR
142 * On entry, ALPHA specifies the scalar alpha, i.e., the cons-
143 * tant with which the matrix elements are to be scaled.
144 *
145 * A (local input/local output) pointer to CHAR
146 * On entry, A is an array of dimension (LLD_A, Ka), where Ka is
147 * at least Lc( 1, JA+N-1 ). Before entry, this array contains
148 * the local entries of the matrix A to be scaled. On exit, the
149 * local entries of this array corresponding to the to the en-
150 * tries of the submatrix sub( A ) are overwritten by the local
151 * entries of the m by n scaled submatrix.
152 *
153 * IA (global input) INTEGER
154 * On entry, IA specifies A's global row index, which points to
155 * the beginning of the submatrix sub( A ).
156 *
157 * JA (global input) INTEGER
158 * On entry, JA specifies A's global column index, which points
159 * to the beginning of the submatrix sub( A ).
160 *
161 * DESCA (global and local input) INTEGER array
162 * On entry, DESCA is an integer array of dimension DLEN_. This
163 * is the array descriptor for the matrix A.
164 *
165 * -- Written on April 1, 1998 by
166 * Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
167 *
168 * ---------------------------------------------------------------------
169 */
170 /*
171 * .. Local Scalars ..
172 */
173 char UploA, herm, type;
174 int Acol, Arow, Aii, Aimb1, Ainb1, Ajj, Akp, Akq, Ald, Amb, Amp,
175 Amp0, Anb, Anq, Anq0, ctxt, izero=0, k, kb, ktmp, mn, mycol,
176 myrow, nb, npcol, nprow, size;
177 TZSCAL_T scal;
178 /*
179 * .. Local Arrays ..
180 */
181 int Ad0[DLEN_];
182 char * Aptr = NULL;
183 /* ..
184 * .. Executable Statements ..
185 *
186 */
187 /*
188 * Quick return if possible
189 */
190 if( ( M <= 0 ) || ( N <= 0 ) ) return;
191 /*
192 * If alpha is zero, then call PB_Cplapad instead.
193 */
194 type = TYPE->type;
195 UploA = Mupcase( UPLO[0] );
196 herm = ( UploA == CALL ? CNOCONJG : Mupcase( CONJUG[0] ) );
197
198 if( type == SREAL )
199 {
200 if( ((float*)(ALPHA))[REAL_PART] == ZERO )
201 {
202 PB_Cplapad( TYPE, UPLO, NOCONJG, M, N, TYPE->zero, TYPE->zero, A, IA,
203 JA, DESCA );
204 return;
205 }
206 else if( ((float*)(ALPHA))[REAL_PART] == ONE ) return;
207 }
208 else if( type == DREAL )
209 {
210 if( ((double*)(ALPHA))[REAL_PART] == ZERO )
211 {
212 PB_Cplapad( TYPE, UPLO, NOCONJG, M, N, TYPE->zero, TYPE->zero, A, IA,
213 JA, DESCA );
214 return;
215 }
216 else if( ((double*)(ALPHA))[REAL_PART] == ONE ) return;
217 }
218 else if( type == SCPLX )
219 {
220 if( herm == CCONJG )
221 {
222 if( ((float*)(ALPHA))[REAL_PART] == ZERO )
223 {
224 PB_Cplapad( TYPE, UPLO, NOCONJG, M, N, TYPE->zero, TYPE->zero, A,
225 IA, JA, DESCA );
226 return;
227 }
228 }
229 else
230 {
231 if( ((float*)(ALPHA))[IMAG_PART] == ZERO )
232 {
233 if( ((float*)(ALPHA))[REAL_PART] == ZERO )
234 {
235 PB_Cplapad( TYPE, UPLO, NOCONJG, M, N, TYPE->zero, TYPE->zero, A,
236 IA, JA, DESCA );
237 return;
238 }
239 else if( ((float*)(ALPHA))[REAL_PART] == ONE ) return;
240 }
241 }
242 }
243 else if( type == DCPLX )
244 {
245 if( herm == CCONJG )
246 {
247 if( ((double*)(ALPHA))[REAL_PART] == ZERO )
248 {
249 PB_Cplapad( TYPE, UPLO, NOCONJG, M, N, TYPE->zero, TYPE->zero, A,
250 IA, JA, DESCA );
251 return;
252 }
253 }
254 else
255 {
256 if( ((double*)(ALPHA))[IMAG_PART] == ZERO )
257 {
258 if( ((double*)(ALPHA))[REAL_PART] == ZERO )
259 {
260 PB_Cplapad( TYPE, UPLO, NOCONJG, M, N, TYPE->zero, TYPE->zero, A,
261 IA, JA, DESCA );
262 return;
263 }
264 else if( ((double*)(ALPHA))[REAL_PART] == ONE ) return;
265 }
266 }
267 }
268 /*
269 * Retrieve process grid information
270 */
271 Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
272 /*
273 * Compute descriptor Ad0 for sub( A )
274 */
275 PB_Cdescribe( M, N, IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj,
276 &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 );
277 /*
278 * Quick return if I don't own any of sub( A ).
279 */
280 Amp = PB_Cnumroc( M, 0, Aimb1, Amb, myrow, Arow, nprow );
281 Anq = PB_Cnumroc( N, 0, Ainb1, Anb, mycol, Acol, npcol );
282 if( ( Amp <= 0 ) || ( Anq <= 0 ) ) return;
283
284 size = TYPE->size;
285 scal = ( herm == CCONJG ? TYPE->Fhescal : TYPE->Ftzscal );
286 Aptr = Mptr( A, Aii, Ajj, Ald, size );
287 /*
288 * When the entire sub( A ) needs to be scaled or when sub( A ) is replicated in
289 * all processes, just call the local routine.
290 */
291 if( ( Mupcase( UPLO[0] ) == CALL ) ||
292 ( ( ( Arow < 0 ) || ( nprow == 1 ) ) &&
293 ( ( Acol < 0 ) || ( npcol == 1 ) ) ) )
294 {
295 scal( C2F_CHAR( UPLO ), &Amp, &Anq, &izero, ALPHA, Aptr, &Ald );
296 return;
297 }
298 /*
299 * Computational partitioning size is computed as the product of the logical
300 * value returned by pilaenv_ and two times the least common multiple of nprow
301 * and npcol.
302 */
303 nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &type ) ) *
304 PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) );
305
306 mn = MIN( M, N );
307
308 if( Mupcase( UPLO[0] ) == CLOWER )
309 {
310 /*
311 * Lower triangle of sub( A ): proceed by block of columns. For each block of
312 * columns, operate on the logical diagonal block first and then the remaining
313 * rows of that block of columns.
314 */
315 for( k = 0; k < mn; k += nb )
316 {
317 kb = mn - k; ktmp = k + ( kb = MIN( kb, nb ) );
318 PB_Cplasca2( TYPE, UPLO, CONJUG, kb, kb, ALPHA, Aptr, k, k, Ad0 );
319 Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow );
320 Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
321 Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol );
322 if( ( Amp0 = Amp - Akp ) > 0 )
323 scal( C2F_CHAR( ALL ), &Amp0, &Anq0, &izero, ALPHA, Mptr( Aptr,
324 Akp, Akq, Ald, size ), &Ald );
325 }
326 }
327 else if( Mupcase( UPLO[0] ) == CUPPER )
328 {
329 /*
330 * Upper triangle of sub( A ): proceed by block of columns. For each block of
331 * columns, operate on the trailing rows and then the logical diagonal block
332 * of that block of columns. When M < N, the last columns of sub( A ) are
333 * handled together.
334 */
335 for( k = 0; k < mn; k += nb )
336 {
337 kb = mn - k; kb = MIN( kb, nb );
338 Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
339 Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
340 Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol );
341 if( Akp > 0 )
342 scal( C2F_CHAR( ALL ), &Akp, &Anq0, &izero, ALPHA, Mptr( Aptr,
343 0, Akq, Ald, size ), &Ald );
344 PB_Cplasca2( TYPE, UPLO, CONJUG, kb, kb, ALPHA, Aptr, k, k, Ad0 );
345 }
346 if( ( Anq -= ( Akq += Anq0 ) ) > 0 )
347 scal( C2F_CHAR( ALL ), &Amp, &Anq, &izero, ALPHA, Mptr( Aptr, 0,
348 Akq, Ald, size ), &Ald );
349 }
350 else
351 {
352 /*
353 * All of sub( A ): proceed by block of columns. For each block of columns,
354 * operate on the trailing rows, then the logical diagonal block, and finally
355 * the remaining rows of that block of columns. When M < N, the last columns
356 * of sub( A ) are handled together.
357 */
358 for( k = 0; k < mn; k += nb )
359 {
360 kb = mn - k; kb = MIN( kb, nb );
361 Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
362 Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
363 Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol );
364 if( Akp > 0 )
365 scal( C2F_CHAR( ALL ), &Akp, &Anq0, &izero, ALPHA, Mptr( Aptr,
366 0, Akq, Ald, size ), &Ald );
367 PB_Cplasca2( TYPE, UPLO, NOCONJG, kb, kb, ALPHA, Aptr, k, k, Ad0 );
368 Akp = PB_Cnumroc( k+kb, 0, Aimb1, Amb, myrow, Arow, nprow );
369 if( ( Amp0 = Amp - Akp ) > 0 )
370 scal( C2F_CHAR( ALL ), &Amp0, &Anq0, &izero, ALPHA, Mptr( Aptr,
371 Akp, Akq, Ald, size ), &Ald );
372 }
373 if( ( Anq -= ( Akq += Anq0 ) ) > 0 )
374 scal( C2F_CHAR( ALL ), &Amp, &Anq, &izero, ALPHA, Mptr( Aptr, 0,
375 Akq, Ald, size ), &Ald );
376 }
377 /*
378 * End of PB_Cplascal
379 */
380 }
381