1 SUBROUTINE PDROW2COL( ICTXT, M, N, NB, VS, LDVS, VD, LDVD, 2 $ RSRC, CSRC, RDEST, CDEST, WORK) 3* 4* -- ScaLAPACK tools routine (version 1.7) -- 5* University of Tennessee, Knoxville, Oak Ridge National Laboratory, 6* and University of California, Berkeley. 7* May 1, 1997 8* 9* .. Scalar Arguments .. 10 INTEGER CDEST, CSRC, ICTXT, LDVD, LDVS, M, N, NB, 11 $ RDEST, RSRC 12* .. 13* .. Array Arguments .. 14 DOUBLE PRECISION VD( LDVD, * ), VS( LDVS, * ), WORK( * ) 15* .. 16* 17* Purpose 18* ======= 19* 20* Take a block of vectors with M total rows which are distributed over 21* a row of processes, and distribute those rows over a column of 22* processes. This routine minimizes communication by sending all 23* information it has that a given process in the CDEST needs at once. 24* To do this it uses the least common multiple (LCM) concept. This is 25* simply the realization that if I have part of a vector split over a 26* process row consisting of Q processes, and I want to send all of that 27* vector that I own to a new vector distributed over P processes within 28* a process column, that after I find the process in RDEST that owns 29* the row of the vector I'm currently looking at, he will want every 30* ( (LCM(P,Q)/Q ) block of my vector (the block being of size NB x N). 31* 32* Arguments 33* ========= 34* 35* Rem: MP, resp. NQ, denotes the number of local rows, resp. local 36* ==== columns, necessary to store a global vector of dimension M 37* across P processes, resp. N over Q processes. 38* 39* ICTXT (global input) INTEGER 40* The BLACS context handle, indicating the global context of 41* the operation. The context itself is global. 42* 43* M (global input) INTEGER 44* The number of global rows each vector has. 45* 46* N (global input) INTEGER 47* The number of vectors in the vector block 48* 49* NB (global input) INTEGER 50* The blocking factor used to divide the rows of the vector 51* amongst the processes of a row. 52* 53* VS (local input) DOUBLE PRECISION 54* Array of dimension (LDVS,N), the block of vectors stored on 55* process row RSRC to be put into memory VD, and stored on 56* process column CDEST. 57* 58* LDVS (local input) INTEGER 59* The leading dimension of VS. 60* 61* VD (local output) DOUBLE PRECISION 62* Array of dimension (LDVD,N), on output, the contents of VD 63* stored on process column CDEST will be here. 64* 65* LDVD (local input) INTEGER 66* The leading dimension of VD. 67* 68* RSRC (global input) INTEGER 69* The process row VS is distributed over. 70* 71* CSRC (global input) INTEGER 72* The process column the distributed block of vectors VS 73* begins on. 74* 75* RDEST (global input) INTEGER 76* The process row that VD begins on. 77* 78* CDEST (global input) INTEGER 79* The process column to distribute VD over. 80* 81* WORK (local workspace) DOUBLE PRECISION 82* Array, dimension (LDW). The required size of work varies: 83* if( nprow.eq.npcol ) then 84* LDW = 0; WORK not accessed. 85* else 86* lcm = least common multiple of process rows and columns. 87* Mq = number of rows of VS on my process. 88* npcol = number of process columns 89* CEIL = the ceiling of given operation 90* LDW = NB*N*CEIL( CEIL( Mq/NB )/(LCM/npcol) ) 91* end if 92* 93* ===================================================================== 94* 95* .. Local Scalars .. 96 INTEGER CBLKSKIP, ICPY, II, ISTART, ICSRC, IRDEST, JB, 97 $ JJ, K, LCM, MP, MQ, MYCOL, MYDIST, MYROW, 98 $ NBLOCKS, NPCOL, NPROW, RBLKSKIP 99* .. 100* .. External Subroutines .. 101 EXTERNAL BLACS_GRIDINFO, DGESD2D, DGERV2D, DLACPY 102* .. 103* .. External Functions .. 104 INTEGER ILCM, NUMROC 105 EXTERNAL ILCM, NUMROC 106* .. 107* .. Executable Statements .. 108* 109* 110* .. Initialize Variables .. 111* 112 ICPY = 0 113* 114 CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) 115* 116* If we are not in special case for NPROW = NPCOL where there is no 117* copying required 118* 119 IF( NPROW .NE. NPCOL ) THEN 120 LCM = ILCM( NPROW, NPCOL ) 121 RBLKSKIP = LCM / NPCOL 122 CBLKSKIP = LCM / NPROW 123* 124* If I have part of VS, the source vector(s) 125* 126 IF( MYROW.EQ.RSRC ) THEN 127* 128 ISTART = 1 129* 130* Figure my distance from CSRC: the process in CDEST the same 131* distance from RDEST will want my first block 132* 133 MYDIST = MOD( NPCOL+MYCOL-CSRC, NPCOL ) 134 MQ = NUMROC( M, NB, MYCOL, CSRC, NPCOL ) 135 IRDEST = MOD( RDEST+MYDIST, NPROW ) 136* 137* Loop over all possible destination processes 138* 139 DO 20 K = 1, RBLKSKIP 140 JJ = 1 141* 142* If I am not destination process 143* 144 IF( (MYROW.NE.IRDEST).OR.(MYCOL.NE.CDEST) ) THEN 145* 146* Pack all data I own that destination needs 147* 148 DO 10 II = ISTART, MQ, NB*RBLKSKIP 149 JB = MIN( NB, MQ-II+1 ) 150 CALL DLACPY( 'G', JB, N, VS(II,1), LDVS, 151 $ WORK(JJ), JB ) 152 JJ = JJ + NB*N 153 10 CONTINUE 154* 155* Figure how many rows are to be sent and send them if 156* necessary, NOTE: will send extra if NB > JB 157* 158 JJ = JJ - 1 159 IF( JJ.GT.0 ) 160 $ CALL DGESD2D( ICTXT, JJ, 1, WORK, JJ, IRDEST, 161 $ CDEST ) 162* 163* I am both source and destination, save where to start 164* copying from for later use 165* 166 ELSE 167 ICPY = ISTART 168 END IF 169* 170 ISTART = ISTART + NB 171 IRDEST = MOD( IRDEST+NPCOL, NPROW ) 172 20 CONTINUE 173 END IF 174* 175* If I should receive info into VD 176* 177 IF( MYCOL.EQ.CDEST ) THEN 178* 179 ISTART = 1 180* 181* Figure my distance from CDEST: the process in CSRC the same 182* distance from RSRC will have my first block 183* 184 MYDIST = MOD( NPROW+MYROW-RDEST, NPROW ) 185 MP = NUMROC( M, NB, MYROW, RDEST, NPROW ) 186 ICSRC = MOD( CSRC+MYDIST, NPCOL ) 187* 188* Loop over all sending processes 189* 190 DO 50 K = 1, CBLKSKIP 191* 192* If I don't already possess the required data 193* 194 IF( (MYROW.NE.RSRC).OR.(MYCOL.NE.ICSRC) ) THEN 195* 196* Figure how many rows to receive, and receive them 197* NOTE: may receive to much -- NB instead of JB 198* 199 NBLOCKS = (MP - ISTART + NB) / NB 200 JJ = ((NBLOCKS+CBLKSKIP-1) / CBLKSKIP)*NB 201 IF( JJ.GT.0 ) 202 $ CALL DGERV2D( ICTXT, JJ, N, WORK, JJ, RSRC, ICSRC ) 203* 204* Copy data to destination vector 205* 206 JJ = 1 207 DO 30 II = ISTART, MP, NB*CBLKSKIP 208 JB = MIN( NB, MP-II+1 ) 209 CALL DLACPY( 'G', JB, N, WORK(JJ), JB, VD(II,1), 210 $ LDVD ) 211 JJ = JJ + NB*N 212 30 CONTINUE 213* 214* If I am both source and destination 215* 216 ELSE 217 JJ = ICPY 218 DO 40 II = ISTART, MP, NB*CBLKSKIP 219 JB = MIN( NB, MP-II+1 ) 220 CALL DLACPY( 'G', JB, N, VS(JJ,1), LDVS, VD(II,1), 221 $ LDVD ) 222 JJ = JJ + NB*RBLKSKIP 223 40 CONTINUE 224 END IF 225 ISTART = ISTART + NB 226 ICSRC = MOD( ICSRC+NPROW, NPCOL ) 227 50 CONTINUE 228 END IF 229* 230* if NPROW = NPCOL, there is a one-to-one correspondance between 231* process rows and columns, so no work space or copying required 232* 233 ELSE 234* 235 IF( MYROW.EQ.RSRC ) THEN 236* 237* Figure my distance from CSRC: the process in CDEST the same 238* distance from RDEST will want my piece of the vector 239* 240 MYDIST = MOD( NPCOL+MYCOL-CSRC, NPCOL ) 241 MQ = NUMROC( M, NB, MYCOL, CSRC, NPCOL ) 242 IRDEST = MOD( RDEST+MYDIST, NPROW ) 243 IF( (MYROW.NE.IRDEST).OR.(MYCOL.NE.CDEST) ) THEN 244 CALL DGESD2D( ICTXT, MQ, N, VS, LDVS, IRDEST, CDEST ) 245 ELSE 246 CALL DLACPY( 'G', MQ, N, VS, LDVS, VD, LDVD ) 247 END IF 248 END IF 249 IF( MYCOL.EQ.CDEST ) THEN 250* 251* Figure my distance from RDEST: the process in RSRC the same 252* distance from CSRC will have my piece of the vector 253* 254 MYDIST = MOD( NPROW+MYROW-RDEST, NPROW ) 255 MP = NUMROC( M, NB, MYROW, RDEST, NPROW ) 256 ICSRC = MOD( CSRC+MYDIST, NPCOL ) 257 IF( (MYCOL.NE.ICSRC).OR.(MYROW.NE. RSRC) ) 258 $ CALL DGERV2D( ICTXT, MP, N, VD, LDVD, RSRC, ICSRC ) 259 END IF 260 END IF 261* 262 RETURN 263* 264* End of PDROW2COL 265* 266 END 267