1 #include "redist.h"
2 /** $Id: pctrmr.c,v 1.1.1.1 2000/02/15 18:04:09 susan Exp $
3   ------------------------------------------------------------------------
4 
5     -- ScaLAPACK routine (version 1.7) --
6        Oak Ridge National Laboratory, Univ. of Tennessee, and Univ. of
7        California, Berkeley.
8        October 31, 1994.
9 
10       SUBROUTINE PCTRMR2D(UPLO, DIAG, M, N,
11      $                     A, IA, JA, ADESC,
12      $                     B, IB, JB, BDESC,
13      $                     CTXT)
14   ------------------------------------------------------------------------
15     Purpose
16     =======
17 
18     PCTRMR2D copies a submatrix of A on a submatrix of B.
19     A and B can have different distributions: they can be on different
20     processor grids, they can have different blocksizes, the beginning
21     of the area to be copied can be at a different places on A and B.
22 
23     The parameters can be confusing when the grids of A and B are
24     partially or completly disjoint, in the case a processor calls
25     this routines but is either not in the A context or B context, the
26     ADESC[CTXT] or BDESC[CTXT] must be equal to -1, to ensure the
27     routine recognise this situation.
28     To summarize the rule:
29     - If a processor is in A context, all parameters related to A must be valid.
30     - If a processor is in B context, all parameters related to B must be valid.
31     -  ADESC[CTXT] and BDESC[CTXT] must be either valid contexts or equal to -1.
32     - M and N must be valid for everyone.
33     - other parameters are not examined.
34 
35     The submatrix to be copied is assumed to be trapezoidal. So only
36     the upper or the lower part will be copied. The other part is
37     unchanged.
38 
39     Notes
40     =====
41 
42     A description vector is associated with each 2D block-cyclicly dis-
43     tributed matrix.  This vector stores the information required to
44     establish the mapping between a matrix entry and its corresponding
45     process and memory location.
46 
47     In the following comments, the character _ should be read as
48     "of the distributed matrix".  Let A be a generic term for any 2D
49     block cyclicly distributed matrix.  Its description vector is DESC_A:
50 
51    NOTATION        STORED IN      EXPLANATION
52    --------------- -------------- --------------------------------------
53    DT_A   (global) DESCA( DT_ )   The descriptor type.
54    CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
55                                   the BLACS process grid A is distribu-
56                                   ted over. The context itself is glo-
57                                   bal, but the handle (the integer
58                                   value) may vary.
59    M_A    (global) DESCA( M_ )    The number of rows in the distributed
60                                   matrix A.
61    N_A    (global) DESCA( N_ )    The number of columns in the distri-
62                                   buted matrix A.
63    MB_A   (global) DESCA( MB_ )   The blocking factor used to distribute
64                                   the rows of A.
65    NB_A   (global) DESCA( NB_ )   The blocking factor used to distribute
66                                   the columns of A.
67    RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
68                                   row of the matrix A is distributed.
69    CSRC_A (global) DESCA( CSRC_ ) The process column over which the
70                                   first column of A is distributed.
71    LLD_A  (local)  DESCA( LLD_ )  The leading dimension of the local
72                                   array storing the local blocks of the
73                                   distributed matrix A.
74                                   LLD_A >= MAX(1,LOCp(M_A)).
75 
76 
77 
78     Important notice
79     ================
80      The parameters of the routine have changed in April 1996
81      There is a new last argument. It must be a context englobing
82      all processors involved in the initial and final distribution.
83 
84      Be aware that all processors  included in this
85       context must call the redistribution routine.
86 
87     Parameters
88     ==========
89 
90     UPLO     (input) CHARACTER*1.
91              On entry, UPLO specifies whether we should copy the upper
92              part of the lower part of the defined submatrix:
93                UPLO = 'U' or 'u' copy the upper triangular part.
94                UPLO = 'L' or 'l' copy the lower triangular part.
95              Unchanged on exit.
96 
97     DIAG     (input) CHARACTER*1.
98              On entry, DIAG specifies whether we should copy the diagonal.
99                 DIAG = 'U' or 'u' do NOT copy the diagonal of the submatrix.
100                 DIAG = 'N' or 'n' DO copy the diagonal of the submatrix.
101              Unchanged on exit.
102 
103     M        (input) INTEGER.
104              On entry, M specifies the number of rows of the
105              submatrix to be copied.  M must be at least zero.
106              Unchanged on exit.
107 
108     N        (input) INTEGER.
109              On entry, N specifies the number of cols of the submatrix
110              to be redistributed.rows of B.  M must be at least zero.
111              Unchanged on exit.
112 
113     A        (input) COMPLEX
114              On entry, the source matrix.
115              Unchanged on exit.
116 
117     IA,JA    (input) INTEGER
118              On entry,the coordinates of the beginning of the submatrix
119              of A to copy.
120              1 <= IA <= M_A - M + 1,1 <= JA <= N_A - N + 1,
121              Unchanged on exit.
122 
123     ADESC    (input) A description vector (see Notes above)
124              If the current processor is not part of the context of A
125              the ADESC[CTXT] must be equal to -1.
126 
127 
128     B        (output) COMPLEX
129              On entry, the destination matrix.
130              The portion corresponding to the defined submatrix are updated.
131 
132     IB,JB    (input) INTEGER
133              On entry,the coordinates of the beginning of the submatrix
134              of B that will be updated.
135              1 <= IB <= M_B - M + 1,1 <= JB <= N_B - N + 1,
136              Unchanged on exit.
137 
138     BDESC    (input) B description vector (see Notes above)
139              For processors not part of the context of B
140              BDESC[CTXT] must be equal to -1.
141 
142     CTXT     (input) a context englobing at least all processors included
143                 in either A context or B context
144 
145 
146 
147    Memory requirement :
148    ====================
149 
150    for the processors belonging to grid 0, one buffer of size block 0
151    and for the processors belonging to grid 1, also one buffer of size
152    block 1.
153 
154    ============================================================
155    Created March 1993 by B. Tourancheau (See sccs for modifications).
156    Modifications by Loic PRYLLI 1995
157    ============================================================ */
158 #define static2 static
159 #if defined(Add_) || defined(f77IsF2C)
160 #define fortran_mr2d pctrmr2do_
161 #define fortran_mr2dnew pctrmr2d_
162 #elif defined(UpCase)
163 #define fortran_mr2dnew PCTRMR2D
164 #define fortran_mr2d PCTRMR2DO
165 #define ccopy_ CCOPY
166 #define clacpy_ CLACPY
167 #else
168 #define fortran_mr2d pctrmr2do
169 #define fortran_mr2dnew pctrmr2d
170 #define ccopy_ ccopy
171 #define clacpy_ clacpy
172 #endif
173 #define Clacpy Cctrlacpy
174 void  Clacpy();
175 typedef struct {
176   float r, i;
177 }     complex;
178 typedef struct {
179   int   desctype;
180   int   ctxt;
181   int   m;
182   int   n;
183   int   nbrow;
184   int   nbcol;
185   int   sprow;
186   int   spcol;
187   int   lda;
188 }     MDESC;
189 #define BLOCK_CYCLIC_2D 1
190 typedef struct {
191   int   gstart;
192   int   len;
193 }     IDESC;
194 #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow)))
195 #define max(A,B) ((A)>(B)?(A):(B))
196 #define min(A,B) ((A)>(B)?(B):(A))
197 #define DIVUP(a,b) ( ((a)-1) /(b)+1)
198 #define ROUNDUP(a,b) (DIVUP(a,b)*(b))
199 #ifdef MALLOCDEBUG
200 #define malloc mymalloc
201 #define free myfree
202 #define realloc myrealloc
203 #endif
204 /* Cblacs */
205 extern void Cblacs_pcoord();
206 extern int Cblacs_pnum();
207 extern void Csetpvmtids();
208 extern void Cblacs_get();
209 extern void Cblacs_pinfo();
210 extern void Cblacs_gridinfo();
211 extern void Cblacs_gridinit();
212 extern void Cblacs_exit();
213 extern void Cblacs_gridexit();
214 extern void Cblacs_setup();
215 extern void Cigebs2d();
216 extern void Cigebr2d();
217 extern void Cigesd2d();
218 extern void Cigerv2d();
219 extern void Cigsum2d();
220 extern void Cigamn2d();
221 extern void Cigamx2d();
222 extern void Ccgesd2d();
223 extern void Ccgerv2d();
224 /* lapack */
225 void  clacpy_();
226 /* aux fonctions */
227 extern int localindice();
228 extern void *mr2d_malloc();
229 extern int ppcm();
230 extern int localsize();
231 extern int memoryblocksize();
232 extern int changeorigin();
233 extern void paramcheck();
234 /* tools and others function */
235 #define scanD0 ctrscanD0
236 #define dispmat ctrdispmat
237 #define setmemory ctrsetmemory
238 #define freememory ctrfreememory
239 #define scan_intervals ctrscan_intervals
240 extern void scanD0();
241 extern void dispmat();
242 extern void setmemory();
243 extern void freememory();
244 extern int scan_intervals();
245 extern void Cpctrmr2do();
246 extern void Cpctrmr2d();
247 /* some defines for Cpctrmr2do */
248 #define SENDBUFF 0
249 #define RECVBUFF 1
250 #define SIZEBUFF 2
251 #if 0
252 #define DEBUG
253 #endif
254 #ifndef DEBUG
255 #define NDEBUG
256 #endif
257 #include <stdio.h>
258 #include <stdlib.h>
259 #include <assert.h>
260 #define DESCLEN 9
261 void
fortran_mr2d(uplo,diag,m,n,A,ia,ja,desc_A,B,ib,jb,desc_B)262 fortran_mr2d(uplo, diag, m, n, A, ia, ja, desc_A,
263 	     B, ib, jb, desc_B)
264   char *uplo, *diag;
265   int  *ia, *ib, *ja, *jb, *m, *n;
266   int   desc_A[DESCLEN], desc_B[DESCLEN];
267   complex *A, *B;
268 {
269   Cpctrmr2do(uplo, diag, *m, *n, A, *ia, *ja, (MDESC *) desc_A,
270 	     B, *ib, *jb, (MDESC *) desc_B);
271   return;
272 }
273 void
fortran_mr2dnew(uplo,diag,m,n,A,ia,ja,desc_A,B,ib,jb,desc_B,gcontext)274 fortran_mr2dnew(uplo, diag, m, n, A, ia, ja, desc_A,
275 		B, ib, jb, desc_B, gcontext)
276   char *uplo, *diag;
277   int  *ia, *ib, *ja, *jb, *m, *n;
278   int   desc_A[DESCLEN], desc_B[DESCLEN];
279   complex *A, *B;
280   int  *gcontext;
281 {
282   Cpctrmr2d(uplo, diag, *m, *n, A, *ia, *ja, (MDESC *) desc_A,
283 	    B, *ib, *jb, (MDESC *) desc_B, *gcontext);
284   return;
285 }
286 static2 void init_chenille();
287 static2 int inter_len();
288 static2 int block2buff();
289 static2 void buff2block();
290 static2 void gridreshape();
291 void
Cpctrmr2do(uplo,diag,m,n,ptrmyblock,ia,ja,ma,ptrmynewblock,ib,jb,mb)292 Cpctrmr2do(uplo, diag, m, n,
293 	   ptrmyblock, ia, ja, ma,
294 	   ptrmynewblock, ib, jb, mb)
295   char *uplo, *diag;
296   complex *ptrmyblock, *ptrmynewblock;
297 /* pointers to the memory location of the matrix and the redistributed matrix */
298   MDESC *ma;
299   MDESC *mb;
300   int   ia, ja, ib, jb, m, n;
301 {
302   int   dummy, nprocs;
303   int   gcontext;
304   /* first we initialize a global grid which serve as a reference to
305    * communicate from grid a to grid b */
306   Cblacs_pinfo(&dummy, &nprocs);
307   Cblacs_get(0, 0, &gcontext);
308   Cblacs_gridinit(&gcontext, "R", 1, nprocs);
309   Cpctrmr2d(uplo, diag, m, n, ptrmyblock, ia, ja, ma,
310 	    ptrmynewblock, ib, jb, mb, gcontext);
311   Cblacs_gridexit(gcontext);
312 }
313 #define NBPARAM 20	/* p0,q0,p1,q1, puis ma,na,mba,nba,rowa,cola puis
314 			 * idem B puis ia,ja puis ib,jb */
315 #define MAGIC_MAX 100000000
316 void
Cpctrmr2d(uplo,diag,m,n,ptrmyblock,ia,ja,ma,ptrmynewblock,ib,jb,mb,globcontext)317 Cpctrmr2d(uplo, diag, m, n,
318 	  ptrmyblock, ia, ja, ma,
319 	  ptrmynewblock, ib, jb, mb, globcontext)
320   char *uplo, *diag;
321   complex *ptrmyblock, *ptrmynewblock;
322 /* pointers to the memory location of the matrix and the redistributed matrix */
323   MDESC *ma;
324   MDESC *mb;
325   int   ia, ja, ib, jb, m, n, globcontext;
326 {
327   complex *ptrsendbuff, *ptrrecvbuff, *ptrNULL = 0;
328   complex *recvptr;
329   MDESC newa, newb;
330   int  *proc0, *proc1, *param;
331   int   mypnum, myprow0, mypcol0, myprow1, mypcol1, nprocs;
332   int   i, j;
333   int   nprow, npcol, gcontext;
334   int   recvsize, sendsize;
335   IDESC *h_inter;	/* to store the horizontal intersections */
336   IDESC *v_inter;	/* to store the vertical intersections */
337   int   hinter_nb, vinter_nb;	/* number of intrsections in both directions */
338   int   dummy;
339   int   p0, q0, p1, q1;
340   int  *ra, *ca;
341   /* end of variables */
342   /* To simplify further calcul we change the matrix indexation from
343    * 1..m,1..n (fortran) to 0..m-1,0..n-1 */
344   if (m == 0 || n == 0)
345     return;
346   ia -= 1;
347   ja -= 1;
348   ib -= 1;
349   jb -= 1;
350   Cblacs_gridinfo(globcontext, &nprow, &npcol, &dummy, &mypnum);
351   gcontext = globcontext;
352   nprocs = nprow * npcol;
353   /* if the global context that is given to us has not the shape of a line
354    * (nprow != 1), create a new context.  TODO: to be optimal, we should
355    * avoid this because it is an uncessary synchronisation */
356   if (nprow != 1) {
357     gridreshape(&gcontext);
358     Cblacs_gridinfo(gcontext, &dummy, &dummy, &dummy, &mypnum);
359   }
360   Cblacs_gridinfo(ma->ctxt, &p0, &q0, &myprow0, &mypcol0);
361   /* compatibility T3D, must check myprow  and mypcol are within bounds */
362   if (myprow0 >= p0 || mypcol0 >= q0)
363     myprow0 = mypcol0 = -1;
364   assert((myprow0 < p0 && mypcol0 < q0) || (myprow0 == -1 && mypcol0 == -1));
365   Cblacs_gridinfo(mb->ctxt, &p1, &q1, &myprow1, &mypcol1);
366   if (myprow1 >= p1 || mypcol1 >= q1)
367     myprow1 = mypcol1 = -1;
368   assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1));
369   /* exchange the missing parameters among the processors: shape of grids and
370    * location of the processors */
371   param = (int *) mr2d_malloc(3 * (nprocs * 2 + NBPARAM) * sizeof(int));
372   ra = param + nprocs * 2 + NBPARAM;
373   ca = param + (nprocs * 2 + NBPARAM) * 2;
374   for (i = 0; i < nprocs * 2 + NBPARAM; i++)
375     param[i] = MAGIC_MAX;
376   proc0 = param + NBPARAM;
377   proc1 = param + NBPARAM + nprocs;
378   /* we calulate proc0 and proc1 that will give the number of a proc in
379    * respectively a or b in the global context */
380   if (myprow0 >= 0) {
381     proc0[myprow0 * q0 + mypcol0] = mypnum;
382     param[0] = p0;
383     param[1] = q0;
384     param[4] = ma->m;
385     param[5] = ma->n;
386     param[6] = ma->nbrow;
387     param[7] = ma->nbcol;
388     param[8] = ma->sprow;
389     param[9] = ma->spcol;
390     param[10] = ia;
391     param[11] = ja;
392   }
393   if (myprow1 >= 0) {
394     proc1[myprow1 * q1 + mypcol1] = mypnum;
395     param[2] = p1;
396     param[3] = q1;
397     param[12] = mb->m;
398     param[13] = mb->n;
399     param[14] = mb->nbrow;
400     param[15] = mb->nbcol;
401     param[16] = mb->sprow;
402     param[17] = mb->spcol;
403     param[18] = ib;
404     param[19] = jb;
405   }
406   Cigamn2d(gcontext, "All", "H", 2 * nprocs + NBPARAM, 1, param, 2 * nprocs + NBPARAM,
407 	   ra, ca, 2 * nprocs + NBPARAM, -1, -1);
408   newa = *ma;
409   newb = *mb;
410   ma = &newa;
411   mb = &newb;
412   if (myprow0 == -1) {
413     p0 = param[0];
414     q0 = param[1];
415     ma->m = param[4];
416     ma->n = param[5];
417     ma->nbrow = param[6];
418     ma->nbcol = param[7];
419     ma->sprow = param[8];
420     ma->spcol = param[9];
421     ia = param[10];
422     ja = param[11];
423   }
424   if (myprow1 == -1) {
425     p1 = param[2];
426     q1 = param[3];
427     mb->m = param[12];
428     mb->n = param[13];
429     mb->nbrow = param[14];
430     mb->nbcol = param[15];
431     mb->sprow = param[16];
432     mb->spcol = param[17];
433     ib = param[18];
434     jb = param[19];
435   }
436   for (i = 0; i < NBPARAM; i++) {
437     if (param[i] == MAGIC_MAX) {
438       fprintf(stderr, "xxGEMR2D:something wrong in the parameters\n");
439       exit(1);
440     }
441   }
442 #ifndef NDEBUG
443   for (i = 0; i < p0 * q0; i++)
444     assert(proc0[i] >= 0 && proc0[i] < nprocs);
445   for (i = 0; i < p1 * q1; i++)
446     assert(proc1[i] >= 0 && proc1[i] < nprocs);
447 #endif
448   /* check the validity of the parameters */
449   paramcheck(ma, ia, ja, m, n, p0, q0, gcontext);
450   paramcheck(mb, ib, jb, m, n, p1, q1, gcontext);
451   /* we change the problem so that ia < a->nbrow ... andia + m = a->m ... */
452   {
453     int   decal;
454     ia = changeorigin(myprow0, ma->sprow, p0,
455 		      ma->nbrow, ia, &decal, &ma->sprow);
456     ptrmyblock += decal;
457     ja = changeorigin(mypcol0, ma->spcol, q0,
458 		      ma->nbcol, ja, &decal, &ma->spcol);
459     ptrmyblock += decal * ma->lda;
460     ma->m = ia + m;
461     ma->n = ja + n;
462     ib = changeorigin(myprow1, mb->sprow, p1,
463 		      mb->nbrow, ib, &decal, &mb->sprow);
464     ptrmynewblock += decal;
465     jb = changeorigin(mypcol1, mb->spcol, q1,
466 		      mb->nbcol, jb, &decal, &mb->spcol);
467     ptrmynewblock += decal * mb->lda;
468     mb->m = ib + m;
469     mb->n = jb + n;
470     if (p0 == 1)
471       ma->nbrow = ma->m;
472     if (q0 == 1)
473       ma->nbcol = ma->n;
474     if (p1 == 1)
475       mb->nbrow = mb->m;
476     if (q1 == 1)
477       mb->nbcol = mb->n;
478 #ifndef NDEBUG
479     paramcheck(ma, ia, ja, m, n, p0, q0, gcontext);
480     paramcheck(mb, ib, jb, m, n, p1, q1, gcontext);
481 #endif
482   }
483   /* We compute the size of the memory buffer ( we choose the worst case,
484    * when the buffer sizes == the memory block sizes). */
485   if (myprow0 >= 0 && mypcol0 >= 0) {
486     /* Initialize pointer variables */
487     setmemory(&ptrsendbuff, memoryblocksize(ma));
488   };	/* if (mypnum < p0 * q0) */
489   if (myprow1 >= 0 && mypcol1 >= 0) {
490     /* Initialize pointer variables */
491     setmemory(&ptrrecvbuff, memoryblocksize(mb));
492   };	/* if (mypnum < p1 * q1) */
493   /* allocing room for the tabs, alloc for the worst case,local_n or local_m
494    * intervals, in fact the worst case should be less, perhaps half that,I
495    * should think of that one day. */
496   h_inter = (IDESC *) mr2d_malloc(DIVUP(ma->n, q0 * ma->nbcol) *
497 				  ma->nbcol * sizeof(IDESC));
498   v_inter = (IDESC *) mr2d_malloc(DIVUP(ma->m, p0 * ma->nbrow)
499 				  * ma->nbrow * sizeof(IDESC));
500   /* We go for the scanning of indices. For each processor including mypnum,
501    * we fill the sendbuff buffer (scanD0(SENDBUFF)) and when it is done send
502    * it. Then for each processor, we compute the size of message to be
503    * receive scanD0(SIZEBUFF)), post a receive and then allocate the elements
504    * of recvbuff the right place (scanD)(RECVBUFF)) */
505   recvptr = ptrrecvbuff;
506   {
507     int   tot, myrang, step, sens;
508     int  *sender, *recver;
509     int   mesending, merecving;
510     tot = max(p0 * q0, p1 * q1);
511     init_chenille(mypnum, nprocs, p0 * q0, proc0, p1 * q1, proc1,
512 		  &sender, &recver, &myrang);
513     if (myrang == -1)
514       goto after_comm;
515     mesending = myprow0 >= 0;
516     assert(sender[myrang] >= 0 || !mesending);
517     assert(!mesending || proc0[sender[myrang]] == mypnum);
518     merecving = myprow1 >= 0;
519     assert(recver[myrang] >= 0 || !merecving);
520     assert(!merecving || proc1[recver[myrang]] == mypnum);
521     step = tot - 1 - myrang;
522     do {
523       for (sens = 0; sens < 2; sens++) {
524 	/* be careful here, when we communicating with ourselves, we must
525 	 * send first (myrang > step == 0) */
526 	if (mesending && recver[step] >= 0 &&
527 	    (sens == 0)) {
528 	  i = recver[step] / q1;
529 	  j = recver[step] % q1;
530 	  vinter_nb = scan_intervals('r', ia, ib, m, ma, mb, p0, p1, myprow0, i,
531 				     v_inter);
532 	  hinter_nb = scan_intervals('c', ja, jb, n, ma, mb, q0, q1, mypcol0, j,
533 				     h_inter);
534 	  scanD0(uplo, diag, SENDBUFF, ptrsendbuff, &sendsize,
535 		 m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1,
536 		 v_inter, vinter_nb, h_inter, hinter_nb,
537 		 ptrmyblock);
538 	}	/* if (mesending...) { */
539 	if (mesending && recver[step] >= 0 &&
540 	    (sens == myrang > step)) {
541 	  i = recver[step] / q1;
542 	  j = recver[step] % q1;
543 	  if (sendsize > 0
544 	      && (step != myrang || !merecving)
545 		) {
546 	    Ccgesd2d(gcontext, sendsize, 1, ptrsendbuff, sendsize,
547 		     0, proc1[i * q1 + j]);
548 	  }	/* sendsize > 0 */
549 	}	/* if (mesending ... */
550 	if (merecving && sender[step] >= 0 &&
551 	    (sens == myrang <= step)) {
552 	  i = sender[step] / q0;
553 	  j = sender[step] % q0;
554 	  vinter_nb = scan_intervals('r', ib, ia, m, mb, ma, p1, p0, myprow1, i,
555 				     v_inter);
556 	  hinter_nb = scan_intervals('c', jb, ja, n, mb, ma, q1, q0, mypcol1, j,
557 				     h_inter);
558 	  scanD0(uplo, diag, SIZEBUFF, ptrNULL, &recvsize,
559 		 m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1,
560 		 v_inter, vinter_nb, h_inter, hinter_nb, ptrNULL);
561 	  if (recvsize > 0) {
562 	    if (step == myrang && mesending) {
563 	      Clacpy(recvsize, 1,
564 		     ptrsendbuff, recvsize,
565 		     ptrrecvbuff, recvsize);
566 	    } else {
567 	      Ccgerv2d(gcontext, recvsize, 1, ptrrecvbuff, recvsize,
568 		       0, proc0[i * q0 + j]);
569 	    }
570 	  }	/* recvsize > 0 */
571 	}	/* if (merecving ...) */
572 	if (merecving && sender[step] >= 0 && sens == 1) {
573 	  scanD0(uplo, diag, RECVBUFF, ptrrecvbuff, &recvsize,
574 		 m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1,
575 		 v_inter, vinter_nb, h_inter, hinter_nb, ptrmynewblock);
576 	}	/* if (merecving...)  */
577       }	/* for (sens = 0) */
578       step -= 1;
579       if (step < 0)
580 	step = tot - 1;
581     } while (step != tot - 1 - myrang);
582 after_comm:
583     free(sender);
584   }	/* { int tot,nr,ns ...} */
585   /* don't forget to clean up things! */
586   if (myprow1 >= 0 && mypcol1 >= 0) {
587     freememory((char *) ptrrecvbuff);
588   };
589   if (myprow0 >= 0 && mypcol0 >= 0) {
590     freememory((char *) ptrsendbuff);
591   };
592   if (nprow != 1)
593     Cblacs_gridexit(gcontext);
594   free(v_inter);
595   free(h_inter);
596   free(param);
597 }/* distrib */
598 static2 void
init_chenille(mypnum,nprocs,n0,proc0,n1,proc1,psend,precv,myrang)599 init_chenille(mypnum, nprocs, n0, proc0, n1, proc1, psend, precv, myrang)
600   int   nprocs, mypnum, n0, n1;
601   int  *proc0, *proc1, **psend, **precv, *myrang;
602 {
603   int   ns, nr, i, tot;
604   int  *sender, *recver, *g0, *g1;
605   tot = max(n0, n1);
606   sender = (int *) mr2d_malloc((nprocs + tot) * sizeof(int) * 2);
607   recver = sender + tot;
608   *psend = sender;
609   *precv = recver;
610   g0 = recver + tot;
611   g1 = g0 + nprocs;
612   for (i = 0; i < nprocs; i++) {
613     g0[i] = -1;
614     g1[i] = -1;
615   }
616   for (i = 0; i < tot; i++) {
617     sender[i] = -1;
618     recver[i] = -1;
619   }
620   for (i = 0; i < n0; i++)
621     g0[proc0[i]] = i;
622   for (i = 0; i < n1; i++)
623     g1[proc1[i]] = i;
624   ns = 0;
625   nr = 0;
626   *myrang = -1;
627   for (i = 0; i < nprocs; i++)
628     if (g0[i] >= 0 && g1[i] >= 0) {
629       if (i == mypnum)
630 	*myrang = nr;
631       sender[ns] = g0[i];
632       ns += 1;
633       recver[nr] = g1[i];
634       nr += 1;
635       assert(ns <= n0 && nr <= n1 && nr == ns);
636     }
637   for (i = 0; i < nprocs; i++)
638     if (g0[i] >= 0 && g1[i] < 0) {
639       if (i == mypnum)
640 	*myrang = ns;
641       sender[ns] = g0[i];
642       ns += 1;
643       assert(ns <= n0);
644     }
645   for (i = 0; i < nprocs; i++)
646     if (g1[i] >= 0 && g0[i] < 0) {
647       if (i == mypnum)
648 	*myrang = nr;
649       recver[nr] = g1[i];
650       nr += 1;
651       assert(nr <= n1);
652     }
653 }
654 void
Clacpy(m,n,a,lda,b,ldb)655 Clacpy(m, n, a, lda, b, ldb)
656   complex *a, *b;
657   int   m, n, lda, ldb;
658 {
659   int   i, j;
660   lda -= m;
661   ldb -= m;
662   assert(lda >= 0 && ldb >= 0);
663   for (j = 0; j < n; j++) {
664     for (i = 0; i < m; i++)
665       *b++ = *a++;
666     b += ldb;
667     a += lda;
668   }
669 }
670 static2 void
gridreshape(ctxtp)671 gridreshape(ctxtp)
672   int  *ctxtp;
673 {
674   int   ori, final;	/* original context, and new context created, with
675 			 * line form */
676   int   nprow, npcol, myrow, mycol;
677   int  *usermap;
678   int   i, j;
679   ori = *ctxtp;
680   Cblacs_gridinfo(ori, &nprow, &npcol, &myrow, &mycol);
681   usermap = mr2d_malloc(sizeof(int) * nprow * npcol);
682   for (i = 0; i < nprow; i++)
683     for (j = 0; j < npcol; j++) {
684       usermap[i + j * nprow] = Cblacs_pnum(ori, i, j);
685     }
686   /* Cblacs_get(0, 0, &final); */
687   Cblacs_get(ori, 10, &final);
688   Cblacs_gridmap(&final, usermap, 1, 1, nprow * npcol);
689   *ctxtp = final;
690   free(usermap);
691 }
692