1 #include "redist.h"
2 /** $Id: psgemr.c,v 1.1.1.1 2000/02/15 18:04:09 susan Exp $
3   ------------------------------------------------------------------------
4 
5     -- ScaLAPACK routine (version 1.7) --
6        Oak Ridge National Laboratory, Univ. of Tennessee, and Univ. of
7        California, Berkeley.
8        October 31, 1994.
9 
10       SUBROUTINE PSGEMR2D( M, N,
11      $                     A, IA, JA, ADESC,
12      $                     B, IB, JB, BDESC,
13      $                     CTXT)
14   ------------------------------------------------------------------------
15     Purpose
16     =======
17 
18     PSGEMR2D copies a submatrix of A on a submatrix of B.
19     A and B can have different distributions: they can be on different
20     processor grids, they can have different blocksizes, the beginning
21     of the area to be copied can be at a different places on A and B.
22 
23     The parameters can be confusing when the grids of A and B are
24     partially or completly disjoint, in the case a processor calls
25     this routines but is either not in the A context or B context, the
26     ADESC[CTXT] or BDESC[CTXT] must be equal to -1, to ensure the
27     routine recognise this situation.
28     To summarize the rule:
29     - If a processor is in A context, all parameters related to A must be valid.
30     - If a processor is in B context, all parameters related to B must be valid.
31     -  ADESC[CTXT] and BDESC[CTXT] must be either valid contexts or equal to -1.
32     - M and N must be valid for everyone.
33     - other parameters are not examined.
34 
35 
36     Notes
37     =====
38 
39     A description vector is associated with each 2D block-cyclicly dis-
40     tributed matrix.  This vector stores the information required to
41     establish the mapping between a matrix entry and its corresponding
42     process and memory location.
43 
44     In the following comments, the character _ should be read as
45     "of the distributed matrix".  Let A be a generic term for any 2D
46     block cyclicly distributed matrix.  Its description vector is DESC_A:
47 
48    NOTATION        STORED IN      EXPLANATION
49    --------------- -------------- --------------------------------------
50    DT_A   (global) DESCA( DT_ )   The descriptor type.
51    CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
52                                   the BLACS process grid A is distribu-
53                                   ted over. The context itself is glo-
54                                   bal, but the handle (the integer
55                                   value) may vary.
56    M_A    (global) DESCA( M_ )    The number of rows in the distributed
57                                   matrix A.
58    N_A    (global) DESCA( N_ )    The number of columns in the distri-
59                                   buted matrix A.
60    MB_A   (global) DESCA( MB_ )   The blocking factor used to distribute
61                                   the rows of A.
62    NB_A   (global) DESCA( NB_ )   The blocking factor used to distribute
63                                   the columns of A.
64    RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
65                                   row of the matrix A is distributed.
66    CSRC_A (global) DESCA( CSRC_ ) The process column over which the
67                                   first column of A is distributed.
68    LLD_A  (local)  DESCA( LLD_ )  The leading dimension of the local
69                                   array storing the local blocks of the
70                                   distributed matrix A.
71                                   LLD_A >= MAX(1,LOCp(M_A)).
72 
73 
74 
75     Important notice
76     ================
77      The parameters of the routine have changed in April 1996
78      There is a new last argument. It must be a context englobing
79      all processors involved in the initial and final distribution.
80 
81      Be aware that all processors  included in this
82       context must call the redistribution routine.
83 
84     Parameters
85     ==========
86 
87 
88     M        (input) INTEGER.
89              On entry, M specifies the number of rows of the
90              submatrix to be copied.  M must be at least zero.
91              Unchanged on exit.
92 
93     N        (input) INTEGER.
94              On entry, N specifies the number of cols of the submatrix
95              to be redistributed.rows of B.  M must be at least zero.
96              Unchanged on exit.
97 
98     A        (input) REAL
99              On entry, the source matrix.
100              Unchanged on exit.
101 
102     IA,JA    (input) INTEGER
103              On entry,the coordinates of the beginning of the submatrix
104              of A to copy.
105              1 <= IA <= M_A - M + 1,1 <= JA <= N_A - N + 1,
106              Unchanged on exit.
107 
108     ADESC    (input) A description vector (see Notes above)
109              If the current processor is not part of the context of A
110              the ADESC[CTXT] must be equal to -1.
111 
112 
113     B        (output) REAL
114              On entry, the destination matrix.
115              The portion corresponding to the defined submatrix are updated.
116 
117     IB,JB    (input) INTEGER
118              On entry,the coordinates of the beginning of the submatrix
119              of B that will be updated.
120              1 <= IB <= M_B - M + 1,1 <= JB <= N_B - N + 1,
121              Unchanged on exit.
122 
123     BDESC    (input) B description vector (see Notes above)
124              For processors not part of the context of B
125              BDESC[CTXT] must be equal to -1.
126 
127     CTXT     (input) a context englobing at least all processors included
128                 in either A context or B context
129 
130 
131 
132    Memory requirement :
133    ====================
134 
135    for the processors belonging to grid 0, one buffer of size block 0
136    and for the processors belonging to grid 1, also one buffer of size
137    block 1.
138 
139    ============================================================
140    Created March 1993 by B. Tourancheau (See sccs for modifications).
141    Modifications by Loic PRYLLI 1995
142    ============================================================ */
143 #define static2 static
144 #if defined(Add_) || defined(f77IsF2C)
145 #define fortran_mr2d psgemr2do_
146 #define fortran_mr2dnew psgemr2d_
147 #elif defined(UpCase)
148 #define fortran_mr2dnew PSGEMR2D
149 #define fortran_mr2d PSGEMR2DO
150 #define scopy_ SCOPY
151 #define slacpy_ SLACPY
152 #else
153 #define fortran_mr2d psgemr2do
154 #define fortran_mr2dnew psgemr2d
155 #define scopy_ scopy
156 #define slacpy_ slacpy
157 #endif
158 #define Clacpy Csgelacpy
159 void  Clacpy();
160 typedef struct {
161   int   desctype;
162   int   ctxt;
163   int   m;
164   int   n;
165   int   nbrow;
166   int   nbcol;
167   int   sprow;
168   int   spcol;
169   int   lda;
170 }     MDESC;
171 #define BLOCK_CYCLIC_2D 1
172 typedef struct {
173   int   lstart;
174   int   len;
175 }     IDESC;
176 #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow)))
177 #define max(A,B) ((A)>(B)?(A):(B))
178 #define min(A,B) ((A)>(B)?(B):(A))
179 #define DIVUP(a,b) ( ((a)-1) /(b)+1)
180 #define ROUNDUP(a,b) (DIVUP(a,b)*(b))
181 #ifdef MALLOCDEBUG
182 #define malloc mymalloc
183 #define free myfree
184 #define realloc myrealloc
185 #endif
186 /* Cblacs */
187 extern void Cblacs_pcoord();
188 extern int Cblacs_pnum();
189 extern void Csetpvmtids();
190 extern void Cblacs_get();
191 extern void Cblacs_pinfo();
192 extern void Cblacs_gridinfo();
193 extern void Cblacs_gridinit();
194 extern void Cblacs_exit();
195 extern void Cblacs_gridexit();
196 extern void Cblacs_setup();
197 extern void Cigebs2d();
198 extern void Cigebr2d();
199 extern void Cigesd2d();
200 extern void Cigerv2d();
201 extern void Cigsum2d();
202 extern void Cigamn2d();
203 extern void Cigamx2d();
204 extern void Csgesd2d();
205 extern void Csgerv2d();
206 /* lapack */
207 void  slacpy_();
208 /* aux fonctions */
209 extern int localindice();
210 extern void *mr2d_malloc();
211 extern int ppcm();
212 extern int localsize();
213 extern int memoryblocksize();
214 extern int changeorigin();
215 extern void paramcheck();
216 /* tools and others function */
217 #define scanD0 sgescanD0
218 #define dispmat sgedispmat
219 #define setmemory sgesetmemory
220 #define freememory sgefreememory
221 #define scan_intervals sgescan_intervals
222 extern void scanD0();
223 extern void dispmat();
224 extern void setmemory();
225 extern void freememory();
226 extern int scan_intervals();
227 extern void Cpsgemr2do();
228 extern void Cpsgemr2d();
229 /* some defines for Cpsgemr2do */
230 #define SENDBUFF 0
231 #define RECVBUFF 1
232 #define SIZEBUFF 2
233 #if 0
234 #define DEBUG
235 #endif
236 #ifndef DEBUG
237 #define NDEBUG
238 #endif
239 #include <stdio.h>
240 #include <stdlib.h>
241 #include <assert.h>
242 #define DESCLEN 9
243 void
fortran_mr2d(m,n,A,ia,ja,desc_A,B,ib,jb,desc_B)244 fortran_mr2d(m, n, A, ia, ja, desc_A,
245 	     B, ib, jb, desc_B)
246   int  *ia, *ib, *ja, *jb, *m, *n;
247   int   desc_A[DESCLEN], desc_B[DESCLEN];
248   float *A, *B;
249 {
250   Cpsgemr2do(*m, *n, A, *ia, *ja, (MDESC *) desc_A,
251 	     B, *ib, *jb, (MDESC *) desc_B);
252   return;
253 }
254 void
fortran_mr2dnew(m,n,A,ia,ja,desc_A,B,ib,jb,desc_B,gcontext)255 fortran_mr2dnew(m, n, A, ia, ja, desc_A,
256 		B, ib, jb, desc_B, gcontext)
257   int  *ia, *ib, *ja, *jb, *m, *n;
258   int   desc_A[DESCLEN], desc_B[DESCLEN];
259   float *A, *B;
260   int  *gcontext;
261 {
262   Cpsgemr2d(*m, *n, A, *ia, *ja, (MDESC *) desc_A,
263 	    B, *ib, *jb, (MDESC *) desc_B, *gcontext);
264   return;
265 }
266 static2 void init_chenille();
267 static2 int inter_len();
268 static2 int block2buff();
269 static2 void buff2block();
270 static2 void gridreshape();
271 void
Cpsgemr2do(m,n,ptrmyblock,ia,ja,ma,ptrmynewblock,ib,jb,mb)272 Cpsgemr2do(m, n,
273 	   ptrmyblock, ia, ja, ma,
274 	   ptrmynewblock, ib, jb, mb)
275   float *ptrmyblock, *ptrmynewblock;
276 /* pointers to the memory location of the matrix and the redistributed matrix */
277   MDESC *ma;
278   MDESC *mb;
279   int   ia, ja, ib, jb, m, n;
280 {
281   int   dummy, nprocs;
282   int   gcontext;
283   /* first we initialize a global grid which serve as a reference to
284    * communicate from grid a to grid b */
285   Cblacs_pinfo(&dummy, &nprocs);
286   Cblacs_get(0, 0, &gcontext);
287   Cblacs_gridinit(&gcontext, "R", 1, nprocs);
288   Cpsgemr2d(m, n, ptrmyblock, ia, ja, ma,
289 	    ptrmynewblock, ib, jb, mb, gcontext);
290   Cblacs_gridexit(gcontext);
291 }
292 #define NBPARAM 20	/* p0,q0,p1,q1, puis ma,na,mba,nba,rowa,cola puis
293 			 * idem B puis ia,ja puis ib,jb */
294 #define MAGIC_MAX 100000000
295 void
Cpsgemr2d(m,n,ptrmyblock,ia,ja,ma,ptrmynewblock,ib,jb,mb,globcontext)296 Cpsgemr2d(m, n,
297 	  ptrmyblock, ia, ja, ma,
298 	  ptrmynewblock, ib, jb, mb, globcontext)
299   float *ptrmyblock, *ptrmynewblock;
300 /* pointers to the memory location of the matrix and the redistributed matrix */
301   MDESC *ma;
302   MDESC *mb;
303   int   ia, ja, ib, jb, m, n, globcontext;
304 {
305   float *ptrsendbuff, *ptrrecvbuff, *ptrNULL = 0;
306   float *recvptr;
307   MDESC newa, newb;
308   int  *proc0, *proc1, *param;
309   int   mypnum, myprow0, mypcol0, myprow1, mypcol1, nprocs;
310   int   i, j;
311   int   nprow, npcol, gcontext;
312   int   recvsize, sendsize;
313   IDESC *h_inter;	/* to store the horizontal intersections */
314   IDESC *v_inter;	/* to store the vertical intersections */
315   int   hinter_nb, vinter_nb;	/* number of intrsections in both directions */
316   int   dummy;
317   int   p0, q0, p1, q1;
318   int  *ra, *ca;
319   /* end of variables */
320   /* To simplify further calcul we change the matrix indexation from
321    * 1..m,1..n (fortran) to 0..m-1,0..n-1 */
322   if (m == 0 || n == 0)
323     return;
324   ia -= 1;
325   ja -= 1;
326   ib -= 1;
327   jb -= 1;
328   Cblacs_gridinfo(globcontext, &nprow, &npcol, &dummy, &mypnum);
329   gcontext = globcontext;
330   nprocs = nprow * npcol;
331   /* if the global context that is given to us has not the shape of a line
332    * (nprow != 1), create a new context.  TODO: to be optimal, we should
333    * avoid this because it is an uncessary synchronisation */
334   if (nprow != 1) {
335     gridreshape(&gcontext);
336     Cblacs_gridinfo(gcontext, &dummy, &dummy, &dummy, &mypnum);
337   }
338   Cblacs_gridinfo(ma->ctxt, &p0, &q0, &myprow0, &mypcol0);
339   /* compatibility T3D, must check myprow  and mypcol are within bounds */
340   if (myprow0 >= p0 || mypcol0 >= q0)
341     myprow0 = mypcol0 = -1;
342   assert((myprow0 < p0 && mypcol0 < q0) || (myprow0 == -1 && mypcol0 == -1));
343   Cblacs_gridinfo(mb->ctxt, &p1, &q1, &myprow1, &mypcol1);
344   if (myprow1 >= p1 || mypcol1 >= q1)
345     myprow1 = mypcol1 = -1;
346   assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1));
347   /* exchange the missing parameters among the processors: shape of grids and
348    * location of the processors */
349   param = (int *) mr2d_malloc(3 * (nprocs * 2 + NBPARAM) * sizeof(int));
350   ra = param + nprocs * 2 + NBPARAM;
351   ca = param + (nprocs * 2 + NBPARAM) * 2;
352   for (i = 0; i < nprocs * 2 + NBPARAM; i++)
353     param[i] = MAGIC_MAX;
354   proc0 = param + NBPARAM;
355   proc1 = param + NBPARAM + nprocs;
356   /* we calulate proc0 and proc1 that will give the number of a proc in
357    * respectively a or b in the global context */
358   if (myprow0 >= 0) {
359     proc0[myprow0 * q0 + mypcol0] = mypnum;
360     param[0] = p0;
361     param[1] = q0;
362     param[4] = ma->m;
363     param[5] = ma->n;
364     param[6] = ma->nbrow;
365     param[7] = ma->nbcol;
366     param[8] = ma->sprow;
367     param[9] = ma->spcol;
368     param[10] = ia;
369     param[11] = ja;
370   }
371   if (myprow1 >= 0) {
372     proc1[myprow1 * q1 + mypcol1] = mypnum;
373     param[2] = p1;
374     param[3] = q1;
375     param[12] = mb->m;
376     param[13] = mb->n;
377     param[14] = mb->nbrow;
378     param[15] = mb->nbcol;
379     param[16] = mb->sprow;
380     param[17] = mb->spcol;
381     param[18] = ib;
382     param[19] = jb;
383   }
384   Cigamn2d(gcontext, "All", "H", 2 * nprocs + NBPARAM, 1, param, 2 * nprocs + NBPARAM,
385 	   ra, ca, 2 * nprocs + NBPARAM, -1, -1);
386   newa = *ma;
387   newb = *mb;
388   ma = &newa;
389   mb = &newb;
390   if (myprow0 == -1) {
391     p0 = param[0];
392     q0 = param[1];
393     ma->m = param[4];
394     ma->n = param[5];
395     ma->nbrow = param[6];
396     ma->nbcol = param[7];
397     ma->sprow = param[8];
398     ma->spcol = param[9];
399     ia = param[10];
400     ja = param[11];
401   }
402   if (myprow1 == -1) {
403     p1 = param[2];
404     q1 = param[3];
405     mb->m = param[12];
406     mb->n = param[13];
407     mb->nbrow = param[14];
408     mb->nbcol = param[15];
409     mb->sprow = param[16];
410     mb->spcol = param[17];
411     ib = param[18];
412     jb = param[19];
413   }
414   for (i = 0; i < NBPARAM; i++) {
415     if (param[i] == MAGIC_MAX) {
416       fprintf(stderr, "xxGEMR2D:something wrong in the parameters\n");
417       exit(1);
418     }
419   }
420 #ifndef NDEBUG
421   for (i = 0; i < p0 * q0; i++)
422     assert(proc0[i] >= 0 && proc0[i] < nprocs);
423   for (i = 0; i < p1 * q1; i++)
424     assert(proc1[i] >= 0 && proc1[i] < nprocs);
425 #endif
426   /* check the validity of the parameters */
427   paramcheck(ma, ia, ja, m, n, p0, q0, gcontext);
428   paramcheck(mb, ib, jb, m, n, p1, q1, gcontext);
429   /* we change the problem so that ia < a->nbrow ... andia + m = a->m ... */
430   {
431     int   decal;
432     ia = changeorigin(myprow0, ma->sprow, p0,
433 		      ma->nbrow, ia, &decal, &ma->sprow);
434     ptrmyblock += decal;
435     ja = changeorigin(mypcol0, ma->spcol, q0,
436 		      ma->nbcol, ja, &decal, &ma->spcol);
437     ptrmyblock += decal * ma->lda;
438     ma->m = ia + m;
439     ma->n = ja + n;
440     ib = changeorigin(myprow1, mb->sprow, p1,
441 		      mb->nbrow, ib, &decal, &mb->sprow);
442     ptrmynewblock += decal;
443     jb = changeorigin(mypcol1, mb->spcol, q1,
444 		      mb->nbcol, jb, &decal, &mb->spcol);
445     ptrmynewblock += decal * mb->lda;
446     mb->m = ib + m;
447     mb->n = jb + n;
448     if (p0 == 1)
449       ma->nbrow = ma->m;
450     if (q0 == 1)
451       ma->nbcol = ma->n;
452     if (p1 == 1)
453       mb->nbrow = mb->m;
454     if (q1 == 1)
455       mb->nbcol = mb->n;
456 #ifndef NDEBUG
457     paramcheck(ma, ia, ja, m, n, p0, q0, gcontext);
458     paramcheck(mb, ib, jb, m, n, p1, q1, gcontext);
459 #endif
460   }
461   /* We compute the size of the memory buffer ( we choose the worst case,
462    * when the buffer sizes == the memory block sizes). */
463   if (myprow0 >= 0 && mypcol0 >= 0) {
464     /* Initialize pointer variables */
465     setmemory(&ptrsendbuff, memoryblocksize(ma));
466   };	/* if (mypnum < p0 * q0) */
467   if (myprow1 >= 0 && mypcol1 >= 0) {
468     /* Initialize pointer variables */
469     setmemory(&ptrrecvbuff, memoryblocksize(mb));
470   };	/* if (mypnum < p1 * q1) */
471   /* allocing room for the tabs, alloc for the worst case,local_n or local_m
472    * intervals, in fact the worst case should be less, perhaps half that,I
473    * should think of that one day. */
474   h_inter = (IDESC *) mr2d_malloc(DIVUP(ma->n, q0 * ma->nbcol) *
475 				  ma->nbcol * sizeof(IDESC));
476   v_inter = (IDESC *) mr2d_malloc(DIVUP(ma->m, p0 * ma->nbrow)
477 				  * ma->nbrow * sizeof(IDESC));
478   /* We go for the scanning of indices. For each processor including mypnum,
479    * we fill the sendbuff buffer (scanD0(SENDBUFF)) and when it is done send
480    * it. Then for each processor, we compute the size of message to be
481    * receive scanD0(SIZEBUFF)), post a receive and then allocate the elements
482    * of recvbuff the right place (scanD)(RECVBUFF)) */
483   recvptr = ptrrecvbuff;
484   {
485     int   tot, myrang, step, sens;
486     int  *sender, *recver;
487     int   mesending, merecving;
488     tot = max(p0 * q0, p1 * q1);
489     init_chenille(mypnum, nprocs, p0 * q0, proc0, p1 * q1, proc1,
490 		  &sender, &recver, &myrang);
491     if (myrang == -1)
492       goto after_comm;
493     mesending = myprow0 >= 0;
494     assert(sender[myrang] >= 0 || !mesending);
495     assert(!mesending || proc0[sender[myrang]] == mypnum);
496     merecving = myprow1 >= 0;
497     assert(recver[myrang] >= 0 || !merecving);
498     assert(!merecving || proc1[recver[myrang]] == mypnum);
499     step = tot - 1 - myrang;
500     do {
501       for (sens = 0; sens < 2; sens++) {
502 	/* be careful here, when we communicating with ourselves, we must
503 	 * send first (myrang > step == 0) */
504 	if (mesending && recver[step] >= 0 &&
505 	    (sens == 0)) {
506 	  i = recver[step] / q1;
507 	  j = recver[step] % q1;
508 	  vinter_nb = scan_intervals('r', ia, ib, m, ma, mb, p0, p1, myprow0, i,
509 				     v_inter);
510 	  hinter_nb = scan_intervals('c', ja, jb, n, ma, mb, q0, q1, mypcol0, j,
511 				     h_inter);
512 	  sendsize = block2buff(v_inter, vinter_nb, h_inter, hinter_nb,
513 				ptrmyblock, ma, ptrsendbuff);
514 	}	/* if (mesending...) { */
515 	if (mesending && recver[step] >= 0 &&
516 	    (sens == myrang > step)) {
517 	  i = recver[step] / q1;
518 	  j = recver[step] % q1;
519 	  if (sendsize > 0
520 	      && (step != myrang || !merecving)
521 		) {
522 	    Csgesd2d(gcontext, sendsize, 1, ptrsendbuff, sendsize,
523 		     0, proc1[i * q1 + j]);
524 	  }	/* sendsize > 0 */
525 	}	/* if (mesending ... */
526 	if (merecving && sender[step] >= 0 &&
527 	    (sens == myrang <= step)) {
528 	  i = sender[step] / q0;
529 	  j = sender[step] % q0;
530 	  vinter_nb = scan_intervals('r', ib, ia, m, mb, ma, p1, p0, myprow1, i,
531 				     v_inter);
532 	  hinter_nb = scan_intervals('c', jb, ja, n, mb, ma, q1, q0, mypcol1, j,
533 				     h_inter);
534 	  recvsize = inter_len(hinter_nb, h_inter, vinter_nb, v_inter);
535 	  if (recvsize > 0) {
536 	    if (step == myrang && mesending) {
537 	      Clacpy(recvsize, 1,
538 		     ptrsendbuff, recvsize,
539 		     ptrrecvbuff, recvsize);
540 	    } else {
541 	      Csgerv2d(gcontext, recvsize, 1, ptrrecvbuff, recvsize,
542 		       0, proc0[i * q0 + j]);
543 	    }
544 	  }	/* recvsize > 0 */
545 	}	/* if (merecving ...) */
546 	if (merecving && sender[step] >= 0 && sens == 1) {
547 	  buff2block(v_inter, vinter_nb, h_inter, hinter_nb,
548 		     recvptr, ptrmynewblock, mb);
549 	}	/* if (merecving...)  */
550       }	/* for (sens = 0) */
551       step -= 1;
552       if (step < 0)
553 	step = tot - 1;
554     } while (step != tot - 1 - myrang);
555 after_comm:
556     free(sender);
557   }	/* { int tot,nr,ns ...} */
558   /* don't forget to clean up things! */
559   if (myprow1 >= 0 && mypcol1 >= 0) {
560     freememory((char *) ptrrecvbuff);
561   };
562   if (myprow0 >= 0 && mypcol0 >= 0) {
563     freememory((char *) ptrsendbuff);
564   };
565   if (nprow != 1)
566     Cblacs_gridexit(gcontext);
567   free(v_inter);
568   free(h_inter);
569   free(param);
570 }/* distrib */
571 static2 void
init_chenille(mypnum,nprocs,n0,proc0,n1,proc1,psend,precv,myrang)572 init_chenille(mypnum, nprocs, n0, proc0, n1, proc1, psend, precv, myrang)
573   int   nprocs, mypnum, n0, n1;
574   int  *proc0, *proc1, **psend, **precv, *myrang;
575 {
576   int   ns, nr, i, tot;
577   int  *sender, *recver, *g0, *g1;
578   tot = max(n0, n1);
579   sender = (int *) mr2d_malloc((nprocs + tot) * sizeof(int) * 2);
580   recver = sender + tot;
581   *psend = sender;
582   *precv = recver;
583   g0 = recver + tot;
584   g1 = g0 + nprocs;
585   for (i = 0; i < nprocs; i++) {
586     g0[i] = -1;
587     g1[i] = -1;
588   }
589   for (i = 0; i < tot; i++) {
590     sender[i] = -1;
591     recver[i] = -1;
592   }
593   for (i = 0; i < n0; i++)
594     g0[proc0[i]] = i;
595   for (i = 0; i < n1; i++)
596     g1[proc1[i]] = i;
597   ns = 0;
598   nr = 0;
599   *myrang = -1;
600   for (i = 0; i < nprocs; i++)
601     if (g0[i] >= 0 && g1[i] >= 0) {
602       if (i == mypnum)
603 	*myrang = nr;
604       sender[ns] = g0[i];
605       ns += 1;
606       recver[nr] = g1[i];
607       nr += 1;
608       assert(ns <= n0 && nr <= n1 && nr == ns);
609     }
610   for (i = 0; i < nprocs; i++)
611     if (g0[i] >= 0 && g1[i] < 0) {
612       if (i == mypnum)
613 	*myrang = ns;
614       sender[ns] = g0[i];
615       ns += 1;
616       assert(ns <= n0);
617     }
618   for (i = 0; i < nprocs; i++)
619     if (g1[i] >= 0 && g0[i] < 0) {
620       if (i == mypnum)
621 	*myrang = nr;
622       recver[nr] = g1[i];
623       nr += 1;
624       assert(nr <= n1);
625     }
626 }
627 #define Mlacpy(mo,no,ao,ldao,bo,ldbo) \
628 { \
629 float *_a,*_b; \
630 int _m,_n,_lda,_ldb; \
631     int _i,_j; \
632     _m = (mo);_n = (no); \
633     _a = (ao);_b = (bo); \
634     _lda = (ldao) - _m; \
635     _ldb = (ldbo) - _m; \
636     assert(_lda >= 0 && _ldb >= 0); \
637     for (_j=0;_j<_n;_j++) { \
638       for (_i=0;_i<_m;_i++) \
639         *_b++ = *_a++; \
640       _b += _ldb; \
641       _a += _lda; \
642     } \
643 }
644 static2 int
block2buff(vi,vinb,hi,hinb,ptra,ma,buff)645 block2buff(vi, vinb, hi, hinb, ptra, ma, buff)
646   int   hinb, vinb;
647   IDESC *hi, *vi;
648   MDESC *ma;
649   float *buff, *ptra;
650 {
651   int   h, v, sizebuff;
652   float *ptr2;
653   sizebuff = 0;
654   for (h = 0; h < hinb; h++) {
655     ptr2 = ptra + hi[h].lstart * ma->lda;
656     for (v = 0; v < vinb; v++) {
657       Mlacpy(vi[v].len, hi[h].len,
658 	     ptr2 + vi[v].lstart,
659 	     ma->lda,
660 	     buff + sizebuff, vi[v].len);
661       sizebuff += hi[h].len * vi[v].len;
662     }
663   }
664   return sizebuff;
665 }
666 static2 void
buff2block(vi,vinb,hi,hinb,buff,ptrb,mb)667 buff2block(vi, vinb, hi, hinb, buff, ptrb, mb)
668   int   hinb, vinb;
669   IDESC *hi, *vi;
670   MDESC *mb;
671   float *buff, *ptrb;
672 {
673   int   h, v, sizebuff;
674   float *ptr2;
675   sizebuff = 0;
676   for (h = 0; h < hinb; h++) {
677     ptr2 = ptrb + hi[h].lstart * mb->lda;
678     for (v = 0; v < vinb; v++) {
679       Mlacpy(vi[v].len, hi[h].len,
680 	     buff + sizebuff, vi[v].len,
681 	     ptr2 + vi[v].lstart,
682 	     mb->lda);
683       sizebuff += hi[h].len * vi[v].len;
684     }
685   }
686 }
687 static2 int
inter_len(hinb,hi,vinb,vi)688 inter_len(hinb, hi, vinb, vi)
689   int   hinb, vinb;
690   IDESC *hi, *vi;
691 {
692   int   hlen, vlen, h, v;
693   hlen = 0;
694   for (h = 0; h < hinb; h++)
695     hlen += hi[h].len;
696   vlen = 0;
697   for (v = 0; v < vinb; v++)
698     vlen += vi[v].len;
699   return hlen * vlen;
700 }
701 void
Clacpy(m,n,a,lda,b,ldb)702 Clacpy(m, n, a, lda, b, ldb)
703   float *a, *b;
704   int   m, n, lda, ldb;
705 {
706   int   i, j;
707   lda -= m;
708   ldb -= m;
709   assert(lda >= 0 && ldb >= 0);
710   for (j = 0; j < n; j++) {
711     for (i = 0; i < m; i++)
712       *b++ = *a++;
713     b += ldb;
714     a += lda;
715   }
716 }
717 static2 void
gridreshape(ctxtp)718 gridreshape(ctxtp)
719   int  *ctxtp;
720 {
721   int   ori, final;	/* original context, and new context created, with
722 			 * line form */
723   int   nprow, npcol, myrow, mycol;
724   int  *usermap;
725   int   i, j;
726   ori = *ctxtp;
727   Cblacs_gridinfo(ori, &nprow, &npcol, &myrow, &mycol);
728   usermap = mr2d_malloc(sizeof(int) * nprow * npcol);
729   for (i = 0; i < nprow; i++)
730     for (j = 0; j < npcol; j++) {
731       usermap[i + j * nprow] = Cblacs_pnum(ori, i, j);
732     }
733   /* Cblacs_get(0, 0, &final); */
734   Cblacs_get(ori, 10, &final);
735   Cblacs_gridmap(&final, usermap, 1, 1, nprow * npcol);
736   *ctxtp = final;
737   free(usermap);
738 }
739