1      SUBROUTINE PDROW2COL( ICTXT, M, N, NB, VS, LDVS, VD, LDVD,
2     $                     RSRC, CSRC, RDEST, CDEST, WORK)
3*
4*  -- ScaLAPACK tools routine (version 1.7) --
5*     University of Tennessee, Knoxville, Oak Ridge National Laboratory,
6*     and University of California, Berkeley.
7*     May 1, 1997
8*
9*     .. Scalar Arguments ..
10      INTEGER            CDEST, CSRC, ICTXT, LDVD, LDVS, M, N, NB,
11     $                   RDEST, RSRC
12*     ..
13*     .. Array Arguments ..
14      DOUBLE PRECISION   VD( LDVD, * ), VS( LDVS, * ), WORK( * )
15*     ..
16*
17*  Purpose
18*  =======
19*
20*  Take a block of vectors with M total rows which are distributed over
21*  a row of processes, and distribute those rows over a column of
22*  processes. This routine minimizes communication by sending all
23*  information it has that a given process in the CDEST needs at once.
24*  To do this it uses the least common multiple (LCM) concept.  This is
25*  simply the realization that if I have part of a vector split over a
26*  process row consisting of Q processes, and I want to send all of that
27*  vector that I own to a new vector distributed over P processes within
28*  a process column, that after I find the process in RDEST that owns
29*  the row of the vector I'm currently looking at, he will want every
30*  ( (LCM(P,Q)/Q ) block of my vector (the block being of size NB x N).
31*
32*  Arguments
33*  =========
34*
35*  Rem:  MP, resp. NQ, denotes the number of local rows, resp. local
36*  ====  columns, necessary to store a global vector of dimension M
37*        across P processes, resp. N over Q processes.
38*
39*  ICTXT   (global input) INTEGER
40*          The BLACS context handle, indicating the global context of
41*          the operation. The context itself is global.
42*
43*  M       (global input) INTEGER
44*          The number of global rows each vector has.
45*
46*  N       (global input) INTEGER
47*          The number of vectors in the vector block
48*
49*  NB      (global input) INTEGER
50*          The blocking factor used to divide the rows of the vector
51*          amongst the processes of a row.
52*
53*  VS      (local input) DOUBLE PRECISION
54*          Array of dimension (LDVS,N), the block of vectors stored on
55*          process row RSRC to be put into memory VD, and stored on
56*          process column CDEST.
57*
58*  LDVS    (local input) INTEGER
59*          The leading dimension of VS.
60*
61*  VD      (local output) DOUBLE PRECISION
62*          Array of dimension (LDVD,N), on output, the contents of VD
63*          stored on process column CDEST will be here.
64*
65*  LDVD    (local input) INTEGER
66*          The leading dimension of VD.
67*
68*  RSRC    (global input) INTEGER
69*          The process row VS is distributed over.
70*
71*  CSRC    (global input) INTEGER
72*          The process column the distributed block of vectors VS
73*          begins on.
74*
75*  RDEST   (global input) INTEGER
76*          The process row that VD begins on.
77*
78*  CDEST   (global input) INTEGER
79*          The process column to distribute VD over.
80*
81*  WORK    (local workspace) DOUBLE PRECISION
82*          Array, dimension (LDW). The required size of work varies:
83*          if( nprow.eq.npcol ) then
84*             LDW = 0; WORK not accessed.
85*          else
86*            lcm = least common multiple of process rows and columns.
87*             Mq  = number of rows of VS on my process.
88*             npcol = number of process columns
89*             CEIL = the ceiling of given operation
90*             LDW = NB*N*CEIL( CEIL( Mq/NB )/(LCM/npcol) )
91*          end if
92*
93*  =====================================================================
94*
95*     .. Local Scalars ..
96      INTEGER            CBLKSKIP, ICPY, II, ISTART, ICSRC, IRDEST, JB,
97     $                   JJ, K, LCM, MP, MQ, MYCOL, MYDIST, MYROW,
98     $                   NBLOCKS, NPCOL, NPROW, RBLKSKIP
99*     ..
100*     .. External Subroutines ..
101      EXTERNAL           BLACS_GRIDINFO, DGESD2D, DGERV2D, DLACPY
102*     ..
103*     .. External Functions ..
104      INTEGER            ILCM, NUMROC
105      EXTERNAL           ILCM, NUMROC
106*     ..
107*     .. Executable Statements ..
108*
109*
110*     .. Initialize Variables ..
111*
112      ICPY = 0
113*
114      CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
115*
116*     If we are not in special case for NPROW = NPCOL where there is no
117*     copying required
118*
119      IF( NPROW .NE. NPCOL ) THEN
120         LCM = ILCM( NPROW, NPCOL )
121         RBLKSKIP = LCM / NPCOL
122         CBLKSKIP = LCM / NPROW
123*
124*        If I have part of VS, the source vector(s)
125*
126         IF( MYROW.EQ.RSRC ) THEN
127*
128            ISTART = 1
129*
130*           Figure my distance from CSRC: the process in CDEST the same
131*           distance from RDEST will want my first block
132*
133            MYDIST = MOD( NPCOL+MYCOL-CSRC, NPCOL )
134            MQ = NUMROC( M, NB, MYCOL, CSRC, NPCOL )
135            IRDEST = MOD( RDEST+MYDIST, NPROW )
136*
137*           Loop over all possible destination processes
138*
139            DO 20 K = 1, RBLKSKIP
140               JJ = 1
141*
142*              If I am not destination process
143*
144               IF( (MYROW.NE.IRDEST).OR.(MYCOL.NE.CDEST) ) THEN
145*
146*                 Pack all data I own that destination needs
147*
148                  DO 10 II = ISTART, MQ, NB*RBLKSKIP
149                     JB = MIN( NB, MQ-II+1 )
150                     CALL DLACPY( 'G', JB, N, VS(II,1), LDVS,
151     $                            WORK(JJ), JB )
152                     JJ = JJ + NB*N
153   10             CONTINUE
154*
155*                 Figure how many rows are to be sent and send them if
156*                 necessary, NOTE: will send extra if NB > JB
157*
158                  JJ = JJ - 1
159                  IF( JJ.GT.0 )
160     $               CALL DGESD2D( ICTXT, JJ, 1, WORK, JJ, IRDEST,
161     $                             CDEST )
162*
163*              I am both source and destination, save where to start
164*              copying from for later use
165*
166               ELSE
167                  ICPY = ISTART
168               END IF
169*
170               ISTART = ISTART + NB
171               IRDEST = MOD( IRDEST+NPCOL, NPROW )
172   20       CONTINUE
173         END IF
174*
175*        If I should receive info into VD
176*
177         IF( MYCOL.EQ.CDEST ) THEN
178*
179            ISTART = 1
180*
181*           Figure my distance from CDEST: the process in CSRC the same
182*           distance from RSRC will have my first block
183*
184            MYDIST = MOD( NPROW+MYROW-RDEST, NPROW )
185            MP = NUMROC( M, NB, MYROW, RDEST, NPROW )
186            ICSRC = MOD( CSRC+MYDIST, NPCOL )
187*
188*           Loop over all sending processes
189*
190            DO 50 K = 1, CBLKSKIP
191*
192*              If I don't already possess the required data
193*
194               IF( (MYROW.NE.RSRC).OR.(MYCOL.NE.ICSRC) ) THEN
195*
196*                 Figure how many rows to receive, and receive them
197*                 NOTE: may receive to much -- NB instead of JB
198*
199                  NBLOCKS = (MP - ISTART + NB) / NB
200                  JJ = ((NBLOCKS+CBLKSKIP-1) / CBLKSKIP)*NB
201                  IF( JJ.GT.0 )
202     $               CALL DGERV2D( ICTXT, JJ, N, WORK, JJ, RSRC, ICSRC )
203*
204*                 Copy data to destination vector
205*
206                  JJ = 1
207                  DO 30 II = ISTART, MP, NB*CBLKSKIP
208                     JB = MIN( NB, MP-II+1 )
209                     CALL DLACPY( 'G', JB, N, WORK(JJ), JB, VD(II,1),
210     $                            LDVD )
211                     JJ = JJ + NB*N
212   30             CONTINUE
213*
214*              If I am both source and destination
215*
216               ELSE
217                  JJ = ICPY
218                  DO 40 II = ISTART, MP, NB*CBLKSKIP
219                     JB = MIN( NB, MP-II+1 )
220                     CALL DLACPY( 'G', JB, N, VS(JJ,1), LDVS, VD(II,1),
221     $                            LDVD )
222                     JJ = JJ + NB*RBLKSKIP
223   40             CONTINUE
224               END IF
225               ISTART = ISTART + NB
226               ICSRC = MOD( ICSRC+NPROW, NPCOL )
227   50       CONTINUE
228         END IF
229*
230*     if NPROW = NPCOL, there is a one-to-one correspondance between
231*     process rows and columns, so no work space or copying required
232*
233      ELSE
234*
235         IF( MYROW.EQ.RSRC ) THEN
236*
237*           Figure my distance from CSRC: the process in CDEST the same
238*           distance from RDEST will want my piece of the vector
239*
240            MYDIST = MOD( NPCOL+MYCOL-CSRC, NPCOL )
241            MQ = NUMROC( M, NB, MYCOL, CSRC, NPCOL )
242            IRDEST = MOD( RDEST+MYDIST, NPROW )
243            IF( (MYROW.NE.IRDEST).OR.(MYCOL.NE.CDEST) ) THEN
244               CALL DGESD2D( ICTXT, MQ, N, VS, LDVS, IRDEST, CDEST )
245            ELSE
246               CALL DLACPY( 'G', MQ, N, VS, LDVS, VD, LDVD )
247            END IF
248         END IF
249         IF( MYCOL.EQ.CDEST ) THEN
250*
251*           Figure my distance from RDEST: the process in RSRC the same
252*           distance from CSRC will have my piece of the vector
253*
254            MYDIST = MOD( NPROW+MYROW-RDEST, NPROW )
255            MP = NUMROC( M, NB, MYROW, RDEST, NPROW )
256            ICSRC = MOD( CSRC+MYDIST, NPCOL )
257            IF( (MYCOL.NE.ICSRC).OR.(MYROW.NE. RSRC) )
258     $         CALL DGERV2D( ICTXT, MP, N, VD, LDVD, RSRC, ICSRC )
259         END IF
260      END IF
261*
262      RETURN
263*
264*     End of PDROW2COL
265*
266      END
267