1 /* ---------------------------------------------------------------------
2 *
3 * -- PBLAS auxiliary routine (version 2.0) --
4 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5 * and University of California, Berkeley.
6 * April 1, 1998
7 *
8 * ---------------------------------------------------------------------
9 */
10 /*
11 * Include files
12 */
13 #include "../pblas.h"
14 #include "../PBpblas.h"
15 #include "../PBtools.h"
16 #include "../PBblacs.h"
17 #include "../PBblas.h"
18
19 #ifdef __STDC__
PB_Cainfog2l(int M,int N,int I,int J,int * DESC,int NPROW,int NPCOL,int MYROW,int MYCOL,int * IMB1,int * INB1,int * MP,int * NQ,int * II,int * JJ,int * PROW,int * PCOL,int * RPROW,int * RPCOL)20 void PB_Cainfog2l( int M, int N, int I, int J, int * DESC, int NPROW,
21 int NPCOL, int MYROW, int MYCOL, int * IMB1,
22 int * INB1, int * MP, int * NQ, int * II, int * JJ,
23 int * PROW, int * PCOL, int * RPROW, int * RPCOL )
24 #else
25 void PB_Cainfog2l( M, N, I, J, DESC, NPROW, NPCOL, MYROW, MYCOL, IMB1,
26 INB1, MP, NQ, II, JJ, PROW, PCOL, RPROW, RPCOL )
27 /*
28 * .. Scalar Arguments ..
29 */
30 int I, * II, * IMB1, * INB1, J, * JJ, M, * MP, MYCOL,
31 MYROW, N, NPCOL, NPROW, * NQ, * PCOL, * PROW, * RPCOL,
32 * RPROW;
33 /*
34 * .. Array Arguments ..
35 */
36 int * DESC;
37 #endif
38 {
39 /*
40 * Purpose
41 * =======
42 *
43 * PB_Cainfog2l computes the starting local row and column indexes II,
44 * JJ corresponding to the submatrix starting globally at the entry
45 * pointed by I, J. This routine returns the coordinates in the grid of
46 * the process owning the matrix entry of global indexes I, J, namely
47 * PROW and PCOL. In addition, this routine computes the quantities MP
48 * and NQ, which are respectively the local number of rows and columns
49 * owned by the process of coordinate MYROW, MYCOL corresponding to the
50 * global submatrix A(I:I+M-1,J:J+N-1). Finally, the size of the first
51 * partial block and the relative process coordinates are also returned
52 * respectively in IMB, INB and RPROW, RPCOL.
53 *
54 * Notes
55 * =====
56 *
57 * A description vector is associated with each 2D block-cyclicly dis-
58 * tributed matrix. This vector stores the information required to
59 * establish the mapping between a matrix entry and its corresponding
60 * process and memory location.
61 *
62 * In the following comments, the character _ should be read as
63 * "of the distributed matrix". Let A be a generic term for any 2D
64 * block cyclicly distributed matrix. Its description vector is DESC_A:
65 *
66 * NOTATION STORED IN EXPLANATION
67 * ---------------- --------------- ------------------------------------
68 * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
69 * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
70 * the NPROW x NPCOL BLACS process grid
71 * A is distributed over. The context
72 * itself is global, but the handle
73 * (the integer value) may vary.
74 * M_A (global) DESCA[ M_ ] The number of rows in the distribu-
75 * ted matrix A, M_A >= 0.
76 * N_A (global) DESCA[ N_ ] The number of columns in the distri-
77 * buted matrix A, N_A >= 0.
78 * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
79 * block of the matrix A, IMB_A > 0.
80 * INB_A (global) DESCA[ INB_ ] The number of columns of the upper
81 * left block of the matrix A,
82 * INB_A > 0.
83 * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
84 * bute the last M_A-IMB_A rows of A,
85 * MB_A > 0.
86 * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
87 * bute the last N_A-INB_A columns of
88 * A, NB_A > 0.
89 * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
90 * row of the matrix A is distributed,
91 * NPROW > RSRC_A >= 0.
92 * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
93 * first column of A is distributed.
94 * NPCOL > CSRC_A >= 0.
95 * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
96 * array storing the local blocks of
97 * the distributed matrix A,
98 * IF( Lc( 1, N_A ) > 0 )
99 * LLD_A >= MAX( 1, Lr( 1, M_A ) )
100 * ELSE
101 * LLD_A >= 1.
102 *
103 * Let K be the number of rows of a matrix A starting at the global in-
104 * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
105 * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
106 * receive if these K rows were distributed over NPROW processes. If K
107 * is the number of columns of a matrix A starting at the global index
108 * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
109 * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
110 * these K columns were distributed over NPCOL processes.
111 *
112 * The values of Lr() and Lc() may be determined via a call to the func-
113 * tion PB_Cnumroc:
114 * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
115 * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
116 *
117 * Arguments
118 * =========
119 *
120 * M (global input) INTEGER
121 * On entry, M specifies the global number of rows of the subma-
122 * trix. M must be at least zero.
123 *
124 * N (global input) INTEGER
125 * On entry, N specifies the global number of columns of the
126 * submatrix. N must be at least zero.
127 *
128 * I (global input) INTEGER
129 * On entry, I specifies the global starting row index of the
130 * submatrix. I must at least zero.
131 *
132 * J (global input) INTEGER
133 * On entry, J specifies the global starting column index of
134 * the submatrix. J must at least zero.
135 *
136 * DESC (global and local input) INTEGER array
137 * On entry, DESC is an integer array of dimension DLEN_. This
138 * is the array descriptor of the underlying matrix.
139 *
140 * NPROW (global input) INTEGER
141 * On entry, NPROW specifies the total number of process rows
142 * over which the matrix is distributed. NPROW must be at least
143 * one.
144 *
145 * NPCOL (global input) INTEGER
146 * On entry, NPCOL specifies the total number of process columns
147 * over which the matrix is distributed. NPCOL must be at least
148 * one.
149 *
150 * MYROW (local input) INTEGER
151 * On entry, MYROW specifies the row coordinate of the process
152 * whose local index II is determined. MYROW must be at least
153 * zero and strictly less than NPROW.
154 *
155 * MYCOL (local input) INTEGER
156 * On entry, MYCOL specifies the column coordinate of the pro-
157 * cess whose local index JJ is determined. MYCOL must be at
158 * least zero and strictly less than NPCOL.
159 *
160 * IMB1 (global output) INTEGER
161 * On exit, IMB1 specifies the number of rows of the upper left
162 * block of the submatrix. On exit, IMB1 is less or equal than
163 * M and greater or equal than MIN( 1, M ).
164 *
165 * INB1 (global output) INTEGER
166 * On exit, INB1 specifies the number of columns of the upper
167 * left block of the submatrix. On exit, INB1 is less or equal
168 * than N and greater or equal than MIN( 1, N ).
169 *
170 * MP (local output) INTEGER
171 * On exit, MP specifies the local number of rows of the subma-
172 * trix, that the processes of row coordinate MYROW own. MP is
173 * at least zero.
174 *
175 * NQ (local output) INTEGER
176 * On exit, NQ specifies the local number of columns of the
177 * submatrix, that the processes of column coordinate MYCOL
178 * own. NQ is at least zero.
179 *
180 * II (local output) INTEGER
181 * On exit, II specifies the local starting row index of the
182 * submatrix. On exit, II is at least zero.
183 *
184 * JJ (local output) INTEGER
185 * On exit, JJ specifies the local starting column index of
186 * the submatrix. On exit, II is at least zero.
187 *
188 * PROW (global output) INTEGER
189 * On exit, PROW specifies the row coordinate of the process
190 * that possesses the first row of the submatrix. On exit, PROW
191 * is -1 if DESC(RSRC_) is -1 on input, and, at least zero and
192 * strictly less than NPROW otherwise.
193 *
194 * PCOL (global output) INTEGER
195 * On exit, PCOL specifies the column coordinate of the process
196 * that possesses the first column of the submatrix. On exit,
197 * PCOL is -1 if DESC(CSRC_) is -1 on input, and, at least zero
198 * and strictly less than NPCOL otherwise.
199 *
200 * RPROW (global output) INTEGER
201 * On exit, RPROW specifies the relative row coordinate of the
202 * process that possesses the first row I of the submatrix. On
203 * exit, RPROW is -1 if DESC(RSRC_) is -1 on input, and, at
204 * least zero and strictly less than NPROW otherwise.
205 *
206 * RPCOL (global output) INTEGER
207 * On exit, RPCOL specifies the relative column coordinate of
208 * the process that possesses the first column J of the subma-
209 * trix. On exit, RPCOL is -1 if DESC(CSRC_) is -1 on input,
210 * and, at least zero and strictly less than NPCOL otherwise.
211 *
212 * -- Written on April 1, 1998 by
213 * Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
214 *
215 * ---------------------------------------------------------------------
216 */
217 /*
218 * .. Local Scalars ..
219 */
220 int i1, ilocblk, j1, mb, mydist, nb, nblocks, csrc, rsrc;
221 /* ..
222 * .. Executable Statements ..
223 *
224 */
225 /*
226 * Retrieve the row distribution parameters
227 */
228 mb = DESC[ MB_ ];
229 rsrc = DESC[ RSRC_ ];
230
231 if( ( rsrc == -1 ) || ( NPROW == 1 ) )
232 {
233 /*
234 * The rows are not distributed, or there is just one process row in the grid.
235 * Therefore, the local and global indexes are the same, as well as the local
236 * and global number of rows. Finally, the relative row process coordinate is
237 * zero, since every process owns all rows. Note that the size of the first
238 * row block can be zero only if M is zero.
239 */
240 *II = I;
241 if( ( *IMB1 = DESC[IMB_] - I ) <= 0 )
242 *IMB1 += ( ( -(*IMB1) ) / mb + 1 ) * mb;
243 *IMB1 = MIN( *IMB1, M );
244 *MP = M;
245 *PROW = rsrc;
246 *RPROW = 0;
247 }
248 else
249 {
250 /*
251 * Figure out PROW, II and IMB1 first.
252 */
253 *IMB1 = DESC[IMB_];
254 if( I < *IMB1 ) /* Is I in first block range ? */
255 {
256 /*
257 * If I is in the first block of rows, then PROW is simply rsrc, II is I in
258 * this process and zero elsewhere, and the size of the first block is the
259 * IMB complement.
260 */
261 *PROW = rsrc;
262 *II = ( ( MYROW == *PROW ) ? I : 0 );
263 *IMB1 -= I;
264 }
265 else
266 {
267 /*
268 * The discussion goes as follows: compute my distance from the source process
269 * so that within this process coordinate system, the source row process is the
270 * process such that mydist=0, or equivalently MYROW == rsrc.
271 *
272 * Find out the global coordinate of the block of rows I belongs to (nblocks),
273 * as well as the minimum local number of row blocks that every process has.
274 *
275 * when mydist < nblocks - ilocblk * NPROW, I own ilocblk + 1 full blocks,
276 * when mydist > nblocks - ilocblk * NPROW, I own ilocblk full blocks,
277 * when mydist = nblocks - ilocblk * NPROW, I own ilocblk full blocks
278 * but not I, or I own ilocblk + 1 blocks and the entry I refers to.
279 */
280 i1 = I - *IMB1;
281 if( MYROW == rsrc )
282 {
283 /*
284 * I refers to an entry that is not in the first block, find out which process
285 * has it.
286 */
287 nblocks = i1 / mb + 1;
288 *PROW = rsrc + nblocks;
289 *PROW -= ( *PROW / NPROW ) * NPROW;
290 /*
291 * Since mydist = 0 and nblocks - ilocblk * NPROW >= 0, there are only three
292 * possible cases:
293 *
294 * 1) When 0 = mydist = nblocks - ilocblk * NPROW = 0 and I don't own I, in
295 * which case II = IMB + ( ilocblk - 1 ) * MB. Note that this case cannot
296 * happen when ilocblk is zero, since nblocks is at least one.
297 *
298 * 2) When 0 = mydist = nblocks - ilocblk * NPROW = 0 and I own I, in which
299 * case I and II can respectively be written as IMB + (nblocks-1)*MB + IL
300 * and IMB+(ilocblk-1) * MB + IL. That is II = I + (ilocblk - nblocks)*MB.
301 * Note that this case cannot happen when ilocblk is zero, since nblocks
302 * is at least one.
303 *
304 * 3) mydist = 0 < nblocks - ilocblk * NPROW, the source process owns
305 * ilocblk+1 full blocks, and therefore II = IMB + ilocblk * MB. Note
306 * that when ilocblk is zero, II is just IMB.
307 */
308 if( nblocks < NPROW )
309 {
310 *II = *IMB1;
311 }
312 else
313 {
314 ilocblk = nblocks / NPROW;
315 if( ilocblk * NPROW >= nblocks )
316 {
317 *II = ( ( MYROW == *PROW ) ? I + ( ilocblk - nblocks ) * mb :
318 *IMB1 + ( ilocblk - 1 ) * mb );
319 }
320 else
321 {
322 *II = *IMB1 + ilocblk * mb;
323 }
324 }
325 }
326 else
327 {
328 /*
329 * I is not in the first block, find out which process has it.
330 */
331 nblocks = i1 / mb + 1;
332 *PROW = rsrc + nblocks;
333 *PROW -= ( *PROW / NPROW ) * NPROW;
334 /*
335 * Compute my distance from the source process so that within this process
336 * coordinate system, the source process is the process such that mydist=0.
337 */
338 if( ( mydist = MYROW - rsrc ) < 0 ) mydist += NPROW;
339 /*
340 * When mydist < nblocks - ilocblk * NPROW, I own ilocblk + 1 full blocks of
341 * size MB since I am not the source process, i.e. II = ( ilocblk + 1 ) * MB.
342 * When mydist >= nblocks - ilocblk * NPROW and I don't own I, I own ilocblk
343 * full blocks of size MB, i.e. II = ilocblk * MB, otherwise I own ilocblk
344 * blocks and I, in which case I can be written as IMB + (nblocks-1)*MB + IL
345 * and II = ilocblk*MB + IL = I - IMB + ( ilocblk - nblocks + 1 )*MB.
346 */
347 if( nblocks < NPROW )
348 {
349 mydist -= nblocks;
350 *II = ( ( mydist < 0 ) ? mb : ( ( MYROW == *PROW ) ?
351 i1 + ( 1 - nblocks ) * mb : 0 ) );
352 }
353 else
354 {
355 ilocblk = nblocks / NPROW;
356 mydist -= nblocks - ilocblk * NPROW;
357 *II = ( ( mydist < 0 ) ? ( ilocblk + 1 ) * mb :
358 ( ( MYROW == *PROW ) ?
359 ( ilocblk - nblocks + 1 ) * mb + i1 :
360 ilocblk * mb ) );
361 }
362 }
363 /*
364 * Update the size of first block
365 */
366 *IMB1 = nblocks * mb - i1;
367 }
368 /*
369 * Now everything is just like M, I=0, IMB1, MB, PROW, NPROW. The discussion
370 * goes as follows: compute my distance from the source process PROW so that
371 * within this process coordinate system, the source process is the process
372 * such that mydist = 0. Figure out MP.
373 */
374 if( M <= *IMB1 )
375 {
376 /*
377 * M <= IMB1: if I am the source process, i.e. I own I (mydist = 0), MP is M
378 * and 0 otherwise.
379 */
380 *MP = ( ( MYROW == *PROW ) ? M : 0 );
381 }
382 else
383 {
384 /*
385 * Find out how many full blocks are globally (nblocks) and locally (ilocblk)
386 * in those M entries
387 */
388 nblocks = ( M - *IMB1 ) / mb + 1;
389
390 if( MYROW == *PROW )
391 {
392 /*
393 * Since mydist = 0 and nblocks - ilocblk * NPROW >= 0, there are only two
394 * possible cases:
395 *
396 * 1) When mydist = nblocks - ilocblk * NPROW = 0, that is NPROW divides
397 * the global number of full blocks, then the source process PROW owns
398 * one more block than the other processes; and M can be rewritten as
399 * M = IMB1 + (nblocks-1) * NB + LNB with LNB >= 0 size of the last block.
400 * Similarly, the local value MP corresponding to M can be written as
401 * MP = IMB1 + (ilocblk-1) * MB + LMB = M + ( ilocblk-1 - (nblocks-1) )*MB.
402 * Note that this case cannot happen when ilocblk is zero, since nblocks
403 * is at least one.
404 *
405 * 2) mydist = 0 < nblocks - ilocblk * NPROW, the source process only owns
406 * full blocks, and therefore MP = IMB1 + ilocblk * MB. Note that when
407 * ilocblk is zero, MP is just IMB1.
408 */
409 if( nblocks < NPROW )
410 {
411 *MP = *IMB1;
412 }
413 else
414 {
415 ilocblk = nblocks / NPROW;
416 *MP = ( ( nblocks - ilocblk * NPROW ) ?
417 *IMB1 + ilocblk * mb :
418 M + ( ilocblk - nblocks ) * mb );
419 }
420 }
421 else
422 {
423 /*
424 * Compute my distance from the source process so that within this process
425 * coordinate system, the source process is the process such that mydist=0.
426 */
427 if( ( mydist = MYROW - *PROW ) < 0 ) mydist += NPROW;
428 /*
429 * When mydist < nblocks - ilocblk * NPROW, I own ilocblk + 1 full blocks of
430 * size MB since I am not the source process,
431 *
432 * when mydist > nblocks - ilocblk * NPROW, I own ilocblk full blocks of
433 * size MB since I am not the source process,
434 *
435 * when mydist = nblocks - ilocblk * NPROW,
436 * either the last block is not full and I own it, in which case
437 * M = IMB1 + (nblocks - 1)*MB + LMB with LNB the size of the last block
438 * such that MB > LMB > 0; the local value MP corresponding to M is given
439 * by MP = ilocblk * MB + LMB = M - IMB1 + ( ilocblk - nblocks + 1 ) * MB;
440 * or the last block is full and I am the first process owning only ilocblk
441 * full blocks of size MB, that is M = IMB + ( nblocks - 1 ) * MB and
442 * MP = ilocblk * MB = M - IMB + ( ilocblk - nblocks + 1 ) * MB.
443 */
444 if( nblocks < NPROW )
445 {
446 mydist -= nblocks;
447 *MP = ( ( mydist < 0 ) ? mb : ( ( mydist > 0 ) ? 0 :
448 M - *IMB1 + mb * ( 1 - nblocks ) ) );
449 }
450 else
451 {
452 ilocblk = nblocks / NPROW;
453 mydist -= nblocks - ilocblk * NPROW;
454 *MP = ( ( mydist < 0 ) ? ( ilocblk + 1 ) * mb :
455 ( ( mydist > 0 ) ? ilocblk * mb :
456 M - *IMB1 + mb * ( ilocblk - nblocks + 1 ) ) );
457 }
458 }
459 }
460 /*
461 * Finally figure out IMB1 and RPROW. Note that IMB1 can be zero when M = 0.
462 */
463 *IMB1 = MIN( *IMB1, M );
464 if( ( *RPROW = MYROW - *PROW ) < 0 ) *RPROW += NPROW;
465 }
466 /*
467 * Idem for the columns
468 */
469 nb = DESC[ NB_ ];
470 csrc = DESC[ CSRC_ ];
471
472 if( ( csrc == -1 ) || ( NPCOL == 1 ) )
473 {
474 *JJ = J;
475 if( ( *INB1 = DESC[INB_] - J ) <= 0 )
476 *INB1 += ( ( -(*INB1) ) / nb + 1 ) * nb;
477 *INB1 = MIN( *INB1, N );
478 *NQ = N;
479 *PCOL = csrc;
480 *RPCOL = 0;
481 }
482 else
483 {
484 *INB1 = DESC[INB_];
485 if( J < *INB1 )
486 {
487 *PCOL = csrc;
488 *JJ = ( ( MYCOL == *PCOL ) ? J : 0 );
489 *INB1 -= J;
490 }
491 else
492 {
493 j1 = J - *INB1;
494 if( MYCOL == csrc )
495 {
496 nblocks = j1 / nb + 1;
497 *PCOL = csrc + nblocks;
498 *PCOL -= ( *PCOL / NPCOL ) * NPCOL;
499
500 if( nblocks < NPCOL )
501 {
502 *JJ = *INB1;
503 }
504 else
505 {
506 ilocblk = nblocks / NPCOL;
507 if( ilocblk * NPCOL >= nblocks )
508 {
509 *JJ = ( ( MYCOL == *PCOL ) ? J + ( ilocblk - nblocks ) * nb :
510 *INB1 + ( ilocblk - 1 ) * nb );
511 }
512 else
513 {
514 *JJ = *INB1 + ilocblk * nb;
515 }
516 }
517 }
518 else
519 {
520 nblocks = j1 / nb + 1;
521 *PCOL = csrc + nblocks;
522 *PCOL -= ( *PCOL / NPCOL ) * NPCOL;
523
524 if( ( mydist = MYCOL - csrc ) < 0 ) mydist += NPCOL;
525
526 if( nblocks < NPCOL )
527 {
528 mydist -= nblocks;
529 *JJ = ( ( mydist < 0 ) ? nb : ( ( MYCOL == *PCOL ) ?
530 j1 + ( 1 - nblocks ) * nb : 0 ) );
531 }
532 else
533 {
534 ilocblk = nblocks / NPCOL;
535 mydist -= nblocks - ilocblk * NPCOL;
536 *JJ = ( ( mydist < 0 ) ? ( ilocblk + 1 ) * nb :
537 ( ( MYCOL == *PCOL ) ?
538 ( ilocblk - nblocks + 1 ) * nb + j1 :
539 ilocblk * nb ) );
540 }
541 }
542 *INB1 = nblocks * nb - j1;
543 }
544
545 if( N <= *INB1 )
546 {
547 *NQ = ( ( MYCOL == *PCOL ) ? N : 0 );
548 }
549 else
550 {
551 nblocks = ( N - *INB1 ) / nb + 1;
552
553 if( MYCOL == *PCOL )
554 {
555 if( nblocks < NPCOL )
556 {
557 *NQ = *INB1;
558 }
559 else
560 {
561 ilocblk = nblocks / NPCOL;
562 *NQ = ( ( nblocks - ilocblk * NPCOL ) ?
563 *INB1 + ilocblk * nb :
564 N + ( ilocblk - nblocks ) * nb );
565 }
566 }
567 else
568 {
569 if( ( mydist = MYCOL - *PCOL ) < 0 ) mydist += NPCOL;
570
571 if( nblocks < NPCOL )
572 {
573 mydist -= nblocks;
574 *NQ = ( ( mydist < 0 ) ? nb : ( ( mydist > 0 ) ? 0 :
575 N - *INB1 + nb * ( 1 - nblocks ) ) );
576 }
577 else
578 {
579 ilocblk = nblocks / NPCOL;
580 mydist -= nblocks - ilocblk * NPCOL;
581 *NQ = ( ( mydist < 0 ) ? ( ilocblk + 1 ) * nb :
582 ( ( mydist > 0 ) ? ilocblk * nb :
583 N - *INB1 + nb * ( ilocblk - nblocks + 1 ) ) );
584 }
585 }
586 }
587 *INB1 = MIN( *INB1, N );
588 if( ( *RPCOL = MYCOL - *PCOL ) < 0 ) *RPCOL += NPCOL;
589 }
590 /*
591 * End of PB_Cainfog2l
592 */
593 }
594