1 #include "redist.h"
2 /** $Id: pitrmr.c,v 1.1.1.1 2000/02/15 18:04:08 susan Exp $
3   ------------------------------------------------------------------------
4 
5     -- ScaLAPACK routine (version 1.7) --
6        Oak Ridge National Laboratory, Univ. of Tennessee, and Univ. of
7        California, Berkeley.
8        October 31, 1994.
9 
10       SUBROUTINE PITRMR2D(UPLO, DIAG, M, N,
11      $                     A, IA, JA, ADESC,
12      $                     B, IB, JB, BDESC,
13      $                     CTXT)
14   ------------------------------------------------------------------------
15     Purpose
16     =======
17 
18     PITRMR2D copies a submatrix of A on a submatrix of B.
19     A and B can have different distributions: they can be on different
20     processor grids, they can have different blocksizes, the beginning
21     of the area to be copied can be at a different places on A and B.
22 
23     The parameters can be confusing when the grids of A and B are
24     partially or completly disjoint, in the case a processor calls
25     this routines but is either not in the A context or B context, the
26     ADESC[CTXT] or BDESC[CTXT] must be equal to -1, to ensure the
27     routine recognise this situation.
28     To summarize the rule:
29     - If a processor is in A context, all parameters related to A must be valid.
30     - If a processor is in B context, all parameters related to B must be valid.
31     -  ADESC[CTXT] and BDESC[CTXT] must be either valid contexts or equal to -1.
32     - M and N must be valid for everyone.
33     - other parameters are not examined.
34 
35     The submatrix to be copied is assumed to be trapezoidal. So only
36     the upper or the lower part will be copied. The other part is
37     unchanged.
38 
39     Notes
40     =====
41 
42     A description vector is associated with each 2D block-cyclicly dis-
43     tributed matrix.  This vector stores the information required to
44     establish the mapping between a matrix entry and its corresponding
45     process and memory location.
46 
47     In the following comments, the character _ should be read as
48     "of the distributed matrix".  Let A be a generic term for any 2D
49     block cyclicly distributed matrix.  Its description vector is DESC_A:
50 
51    NOTATION        STORED IN      EXPLANATION
52    --------------- -------------- --------------------------------------
53    DT_A   (global) DESCA( DT_ )   The descriptor type.
54    CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
55                                   the BLACS process grid A is distribu-
56                                   ted over. The context itself is glo-
57                                   bal, but the handle (the integer
58                                   value) may vary.
59    M_A    (global) DESCA( M_ )    The number of rows in the distributed
60                                   matrix A.
61    N_A    (global) DESCA( N_ )    The number of columns in the distri-
62                                   buted matrix A.
63    MB_A   (global) DESCA( MB_ )   The blocking factor used to distribute
64                                   the rows of A.
65    NB_A   (global) DESCA( NB_ )   The blocking factor used to distribute
66                                   the columns of A.
67    RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
68                                   row of the matrix A is distributed.
69    CSRC_A (global) DESCA( CSRC_ ) The process column over which the
70                                   first column of A is distributed.
71    LLD_A  (local)  DESCA( LLD_ )  The leading dimension of the local
72                                   array storing the local blocks of the
73                                   distributed matrix A.
74                                   LLD_A >= MAX(1,LOCp(M_A)).
75 
76 
77 
78     Important notice
79     ================
80      The parameters of the routine have changed in April 1996
81      There is a new last argument. It must be a context englobing
82      all processors involved in the initial and final distribution.
83 
84      Be aware that all processors  included in this
85       context must call the redistribution routine.
86 
87     Parameters
88     ==========
89 
90     UPLO     (input) CHARACTER*1.
91              On entry, UPLO specifies whether we should copy the upper
92              part of the lower part of the defined submatrix:
93                UPLO = 'U' or 'u' copy the upper triangular part.
94                UPLO = 'L' or 'l' copy the lower triangular part.
95              Unchanged on exit.
96 
97     DIAG     (input) CHARACTER*1.
98              On entry, DIAG specifies whether we should copy the diagonal.
99                 DIAG = 'U' or 'u' do NOT copy the diagonal of the submatrix.
100                 DIAG = 'N' or 'n' DO copy the diagonal of the submatrix.
101              Unchanged on exit.
102 
103     M        (input) INTEGER.
104              On entry, M specifies the number of rows of the
105              submatrix to be copied.  M must be at least zero.
106              Unchanged on exit.
107 
108     N        (input) INTEGER.
109              On entry, N specifies the number of cols of the submatrix
110              to be redistributed.rows of B.  M must be at least zero.
111              Unchanged on exit.
112 
113     A        (input) INTEGER
114              On entry, the source matrix.
115              Unchanged on exit.
116 
117     IA,JA    (input) INTEGER
118              On entry,the coordinates of the beginning of the submatrix
119              of A to copy.
120              1 <= IA <= M_A - M + 1,1 <= JA <= N_A - N + 1,
121              Unchanged on exit.
122 
123     ADESC    (input) A description vector (see Notes above)
124              If the current processor is not part of the context of A
125              the ADESC[CTXT] must be equal to -1.
126 
127 
128     B        (output) INTEGER
129              On entry, the destination matrix.
130              The portion corresponding to the defined submatrix are updated.
131 
132     IB,JB    (input) INTEGER
133              On entry,the coordinates of the beginning of the submatrix
134              of B that will be updated.
135              1 <= IB <= M_B - M + 1,1 <= JB <= N_B - N + 1,
136              Unchanged on exit.
137 
138     BDESC    (input) B description vector (see Notes above)
139              For processors not part of the context of B
140              BDESC[CTXT] must be equal to -1.
141 
142     CTXT     (input) a context englobing at least all processors included
143                 in either A context or B context
144 
145 
146 
147    Memory requirement :
148    ====================
149 
150    for the processors belonging to grid 0, one buffer of size block 0
151    and for the processors belonging to grid 1, also one buffer of size
152    block 1.
153 
154    ============================================================
155    Created March 1993 by B. Tourancheau (See sccs for modifications).
156    Modifications by Loic PRYLLI 1995
157    ============================================================ */
158 #define static2 static
159 #if defined(Add_) || defined(f77IsF2C)
160 #define fortran_mr2d pitrmr2do_
161 #define fortran_mr2dnew pitrmr2d_
162 #elif defined(UpCase)
163 #define fortran_mr2dnew PITRMR2D
164 #define fortran_mr2d PITRMR2DO
165 #define icopy_ ICOPY
166 #define ilacpy_ ILACPY
167 #else
168 #define fortran_mr2d pitrmr2do
169 #define fortran_mr2dnew pitrmr2d
170 #define icopy_ icopy
171 #define ilacpy_ ilacpy
172 #endif
173 #define Clacpy Citrlacpy
174 void  Clacpy();
175 typedef struct {
176   int   desctype;
177   int   ctxt;
178   int   m;
179   int   n;
180   int   nbrow;
181   int   nbcol;
182   int   sprow;
183   int   spcol;
184   int   lda;
185 }     MDESC;
186 #define BLOCK_CYCLIC_2D 1
187 typedef struct {
188   int   gstart;
189   int   len;
190 }     IDESC;
191 #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow)))
192 #define max(A,B) ((A)>(B)?(A):(B))
193 #define min(A,B) ((A)>(B)?(B):(A))
194 #define DIVUP(a,b) ( ((a)-1) /(b)+1)
195 #define ROUNDUP(a,b) (DIVUP(a,b)*(b))
196 #ifdef MALLOCDEBUG
197 #define malloc mymalloc
198 #define free myfree
199 #define realloc myrealloc
200 #endif
201 /* Cblacs */
202 extern void Cblacs_pcoord();
203 extern int Cblacs_pnum();
204 extern void Csetpvmtids();
205 extern void Cblacs_get();
206 extern void Cblacs_pinfo();
207 extern void Cblacs_gridinfo();
208 extern void Cblacs_gridinit();
209 extern void Cblacs_exit();
210 extern void Cblacs_gridexit();
211 extern void Cblacs_setup();
212 extern void Cigebs2d();
213 extern void Cigebr2d();
214 extern void Cigesd2d();
215 extern void Cigerv2d();
216 extern void Cigsum2d();
217 extern void Cigamn2d();
218 extern void Cigamx2d();
219 extern void Cigesd2d();
220 extern void Cigerv2d();
221 /* lapack */
222 void  ilacpy_();
223 /* aux fonctions */
224 extern int localindice();
225 extern void *mr2d_malloc();
226 extern int ppcm();
227 extern int localsize();
228 extern int memoryblocksize();
229 extern int changeorigin();
230 extern void paramcheck();
231 /* tools and others function */
232 #define scanD0 itrscanD0
233 #define dispmat itrdispmat
234 #define setmemory itrsetmemory
235 #define freememory itrfreememory
236 #define scan_intervals itrscan_intervals
237 extern void scanD0();
238 extern void dispmat();
239 extern void setmemory();
240 extern void freememory();
241 extern int scan_intervals();
242 extern void Cpitrmr2do();
243 extern void Cpitrmr2d();
244 /* some defines for Cpitrmr2do */
245 #define SENDBUFF 0
246 #define RECVBUFF 1
247 #define SIZEBUFF 2
248 #if 0
249 #define DEBUG
250 #endif
251 #ifndef DEBUG
252 #define NDEBUG
253 #endif
254 #include <stdio.h>
255 #include <stdlib.h>
256 #include <assert.h>
257 #define DESCLEN 9
258 void
fortran_mr2d(uplo,diag,m,n,A,ia,ja,desc_A,B,ib,jb,desc_B)259 fortran_mr2d(uplo, diag, m, n, A, ia, ja, desc_A,
260 	     B, ib, jb, desc_B)
261   char *uplo, *diag;
262   int  *ia, *ib, *ja, *jb, *m, *n;
263   int   desc_A[DESCLEN], desc_B[DESCLEN];
264   int  *A, *B;
265 {
266   Cpitrmr2do(uplo, diag, *m, *n, A, *ia, *ja, (MDESC *) desc_A,
267 	     B, *ib, *jb, (MDESC *) desc_B);
268   return;
269 }
270 void
fortran_mr2dnew(uplo,diag,m,n,A,ia,ja,desc_A,B,ib,jb,desc_B,gcontext)271 fortran_mr2dnew(uplo, diag, m, n, A, ia, ja, desc_A,
272 		B, ib, jb, desc_B, gcontext)
273   char *uplo, *diag;
274   int  *ia, *ib, *ja, *jb, *m, *n;
275   int   desc_A[DESCLEN], desc_B[DESCLEN];
276   int  *A, *B;
277   int  *gcontext;
278 {
279   Cpitrmr2d(uplo, diag, *m, *n, A, *ia, *ja, (MDESC *) desc_A,
280 	    B, *ib, *jb, (MDESC *) desc_B, *gcontext);
281   return;
282 }
283 static2 void init_chenille();
284 static2 int inter_len();
285 static2 int block2buff();
286 static2 void buff2block();
287 static2 void gridreshape();
288 void
Cpitrmr2do(uplo,diag,m,n,ptrmyblock,ia,ja,ma,ptrmynewblock,ib,jb,mb)289 Cpitrmr2do(uplo, diag, m, n,
290 	   ptrmyblock, ia, ja, ma,
291 	   ptrmynewblock, ib, jb, mb)
292   char *uplo, *diag;
293   int  *ptrmyblock, *ptrmynewblock;
294 /* pointers to the memory location of the matrix and the redistributed matrix */
295   MDESC *ma;
296   MDESC *mb;
297   int   ia, ja, ib, jb, m, n;
298 {
299   int   dummy, nprocs;
300   int   gcontext;
301   /* first we initialize a global grid which serve as a reference to
302    * communicate from grid a to grid b */
303   Cblacs_pinfo(&dummy, &nprocs);
304   Cblacs_get(0, 0, &gcontext);
305   Cblacs_gridinit(&gcontext, "R", 1, nprocs);
306   Cpitrmr2d(uplo, diag, m, n, ptrmyblock, ia, ja, ma,
307 	    ptrmynewblock, ib, jb, mb, gcontext);
308   Cblacs_gridexit(gcontext);
309 }
310 #define NBPARAM 20	/* p0,q0,p1,q1, puis ma,na,mba,nba,rowa,cola puis
311 			 * idem B puis ia,ja puis ib,jb */
312 #define MAGIC_MAX 100000000
313 void
Cpitrmr2d(uplo,diag,m,n,ptrmyblock,ia,ja,ma,ptrmynewblock,ib,jb,mb,globcontext)314 Cpitrmr2d(uplo, diag, m, n,
315 	  ptrmyblock, ia, ja, ma,
316 	  ptrmynewblock, ib, jb, mb, globcontext)
317   char *uplo, *diag;
318   int  *ptrmyblock, *ptrmynewblock;
319 /* pointers to the memory location of the matrix and the redistributed matrix */
320   MDESC *ma;
321   MDESC *mb;
322   int   ia, ja, ib, jb, m, n, globcontext;
323 {
324   int  *ptrsendbuff, *ptrrecvbuff, *ptrNULL = 0;
325   int  *recvptr;
326   MDESC newa, newb;
327   int  *proc0, *proc1, *param;
328   int   mypnum, myprow0, mypcol0, myprow1, mypcol1, nprocs;
329   int   i, j;
330   int   nprow, npcol, gcontext;
331   int   recvsize, sendsize;
332   IDESC *h_inter;	/* to store the horizontal intersections */
333   IDESC *v_inter;	/* to store the vertical intersections */
334   int   hinter_nb, vinter_nb;	/* number of intrsections in both directions */
335   int   dummy;
336   int   p0, q0, p1, q1;
337   int  *ra, *ca;
338   /* end of variables */
339   /* To simplify further calcul we change the matrix indexation from
340    * 1..m,1..n (fortran) to 0..m-1,0..n-1 */
341   if (m == 0 || n == 0)
342     return;
343   ia -= 1;
344   ja -= 1;
345   ib -= 1;
346   jb -= 1;
347   Cblacs_gridinfo(globcontext, &nprow, &npcol, &dummy, &mypnum);
348   gcontext = globcontext;
349   nprocs = nprow * npcol;
350   /* if the global context that is given to us has not the shape of a line
351    * (nprow != 1), create a new context.  TODO: to be optimal, we should
352    * avoid this because it is an uncessary synchronisation */
353   if (nprow != 1) {
354     gridreshape(&gcontext);
355     Cblacs_gridinfo(gcontext, &dummy, &dummy, &dummy, &mypnum);
356   }
357   Cblacs_gridinfo(ma->ctxt, &p0, &q0, &myprow0, &mypcol0);
358   /* compatibility T3D, must check myprow  and mypcol are within bounds */
359   if (myprow0 >= p0 || mypcol0 >= q0)
360     myprow0 = mypcol0 = -1;
361   assert((myprow0 < p0 && mypcol0 < q0) || (myprow0 == -1 && mypcol0 == -1));
362   Cblacs_gridinfo(mb->ctxt, &p1, &q1, &myprow1, &mypcol1);
363   if (myprow1 >= p1 || mypcol1 >= q1)
364     myprow1 = mypcol1 = -1;
365   assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1));
366   /* exchange the missing parameters among the processors: shape of grids and
367    * location of the processors */
368   param = (int *) mr2d_malloc(3 * (nprocs * 2 + NBPARAM) * sizeof(int));
369   ra = param + nprocs * 2 + NBPARAM;
370   ca = param + (nprocs * 2 + NBPARAM) * 2;
371   for (i = 0; i < nprocs * 2 + NBPARAM; i++)
372     param[i] = MAGIC_MAX;
373   proc0 = param + NBPARAM;
374   proc1 = param + NBPARAM + nprocs;
375   /* we calulate proc0 and proc1 that will give the number of a proc in
376    * respectively a or b in the global context */
377   if (myprow0 >= 0) {
378     proc0[myprow0 * q0 + mypcol0] = mypnum;
379     param[0] = p0;
380     param[1] = q0;
381     param[4] = ma->m;
382     param[5] = ma->n;
383     param[6] = ma->nbrow;
384     param[7] = ma->nbcol;
385     param[8] = ma->sprow;
386     param[9] = ma->spcol;
387     param[10] = ia;
388     param[11] = ja;
389   }
390   if (myprow1 >= 0) {
391     proc1[myprow1 * q1 + mypcol1] = mypnum;
392     param[2] = p1;
393     param[3] = q1;
394     param[12] = mb->m;
395     param[13] = mb->n;
396     param[14] = mb->nbrow;
397     param[15] = mb->nbcol;
398     param[16] = mb->sprow;
399     param[17] = mb->spcol;
400     param[18] = ib;
401     param[19] = jb;
402   }
403   Cigamn2d(gcontext, "All", "H", 2 * nprocs + NBPARAM, 1, param, 2 * nprocs + NBPARAM,
404 	   ra, ca, 2 * nprocs + NBPARAM, -1, -1);
405   newa = *ma;
406   newb = *mb;
407   ma = &newa;
408   mb = &newb;
409   if (myprow0 == -1) {
410     p0 = param[0];
411     q0 = param[1];
412     ma->m = param[4];
413     ma->n = param[5];
414     ma->nbrow = param[6];
415     ma->nbcol = param[7];
416     ma->sprow = param[8];
417     ma->spcol = param[9];
418     ia = param[10];
419     ja = param[11];
420   }
421   if (myprow1 == -1) {
422     p1 = param[2];
423     q1 = param[3];
424     mb->m = param[12];
425     mb->n = param[13];
426     mb->nbrow = param[14];
427     mb->nbcol = param[15];
428     mb->sprow = param[16];
429     mb->spcol = param[17];
430     ib = param[18];
431     jb = param[19];
432   }
433   for (i = 0; i < NBPARAM; i++) {
434     if (param[i] == MAGIC_MAX) {
435       fprintf(stderr, "xxGEMR2D:something wrong in the parameters\n");
436       exit(1);
437     }
438   }
439 #ifndef NDEBUG
440   for (i = 0; i < p0 * q0; i++)
441     assert(proc0[i] >= 0 && proc0[i] < nprocs);
442   for (i = 0; i < p1 * q1; i++)
443     assert(proc1[i] >= 0 && proc1[i] < nprocs);
444 #endif
445   /* check the validity of the parameters */
446   paramcheck(ma, ia, ja, m, n, p0, q0, gcontext);
447   paramcheck(mb, ib, jb, m, n, p1, q1, gcontext);
448   /* we change the problem so that ia < a->nbrow ... andia + m = a->m ... */
449   {
450     int   decal;
451     ia = changeorigin(myprow0, ma->sprow, p0,
452 		      ma->nbrow, ia, &decal, &ma->sprow);
453     ptrmyblock += decal;
454     ja = changeorigin(mypcol0, ma->spcol, q0,
455 		      ma->nbcol, ja, &decal, &ma->spcol);
456     ptrmyblock += decal * ma->lda;
457     ma->m = ia + m;
458     ma->n = ja + n;
459     ib = changeorigin(myprow1, mb->sprow, p1,
460 		      mb->nbrow, ib, &decal, &mb->sprow);
461     ptrmynewblock += decal;
462     jb = changeorigin(mypcol1, mb->spcol, q1,
463 		      mb->nbcol, jb, &decal, &mb->spcol);
464     ptrmynewblock += decal * mb->lda;
465     mb->m = ib + m;
466     mb->n = jb + n;
467     if (p0 == 1)
468       ma->nbrow = ma->m;
469     if (q0 == 1)
470       ma->nbcol = ma->n;
471     if (p1 == 1)
472       mb->nbrow = mb->m;
473     if (q1 == 1)
474       mb->nbcol = mb->n;
475 #ifndef NDEBUG
476     paramcheck(ma, ia, ja, m, n, p0, q0, gcontext);
477     paramcheck(mb, ib, jb, m, n, p1, q1, gcontext);
478 #endif
479   }
480   /* We compute the size of the memory buffer ( we choose the worst case,
481    * when the buffer sizes == the memory block sizes). */
482   if (myprow0 >= 0 && mypcol0 >= 0) {
483     /* Initialize pointer variables */
484     setmemory(&ptrsendbuff, memoryblocksize(ma));
485   };	/* if (mypnum < p0 * q0) */
486   if (myprow1 >= 0 && mypcol1 >= 0) {
487     /* Initialize pointer variables */
488     setmemory(&ptrrecvbuff, memoryblocksize(mb));
489   };	/* if (mypnum < p1 * q1) */
490   /* allocing room for the tabs, alloc for the worst case,local_n or local_m
491    * intervals, in fact the worst case should be less, perhaps half that,I
492    * should think of that one day. */
493   h_inter = (IDESC *) mr2d_malloc(DIVUP(ma->n, q0 * ma->nbcol) *
494 				  ma->nbcol * sizeof(IDESC));
495   v_inter = (IDESC *) mr2d_malloc(DIVUP(ma->m, p0 * ma->nbrow)
496 				  * ma->nbrow * sizeof(IDESC));
497   /* We go for the scanning of indices. For each processor including mypnum,
498    * we fill the sendbuff buffer (scanD0(SENDBUFF)) and when it is done send
499    * it. Then for each processor, we compute the size of message to be
500    * receive scanD0(SIZEBUFF)), post a receive and then allocate the elements
501    * of recvbuff the right place (scanD)(RECVBUFF)) */
502   recvptr = ptrrecvbuff;
503   {
504     int   tot, myrang, step, sens;
505     int  *sender, *recver;
506     int   mesending, merecving;
507     tot = max(p0 * q0, p1 * q1);
508     init_chenille(mypnum, nprocs, p0 * q0, proc0, p1 * q1, proc1,
509 		  &sender, &recver, &myrang);
510     if (myrang == -1)
511       goto after_comm;
512     mesending = myprow0 >= 0;
513     assert(sender[myrang] >= 0 || !mesending);
514     assert(!mesending || proc0[sender[myrang]] == mypnum);
515     merecving = myprow1 >= 0;
516     assert(recver[myrang] >= 0 || !merecving);
517     assert(!merecving || proc1[recver[myrang]] == mypnum);
518     step = tot - 1 - myrang;
519     do {
520       for (sens = 0; sens < 2; sens++) {
521 	/* be careful here, when we communicating with ourselves, we must
522 	 * send first (myrang > step == 0) */
523 	if (mesending && recver[step] >= 0 &&
524 	    (sens == 0)) {
525 	  i = recver[step] / q1;
526 	  j = recver[step] % q1;
527 	  vinter_nb = scan_intervals('r', ia, ib, m, ma, mb, p0, p1, myprow0, i,
528 				     v_inter);
529 	  hinter_nb = scan_intervals('c', ja, jb, n, ma, mb, q0, q1, mypcol0, j,
530 				     h_inter);
531 	  scanD0(uplo, diag, SENDBUFF, ptrsendbuff, &sendsize,
532 		 m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1,
533 		 v_inter, vinter_nb, h_inter, hinter_nb,
534 		 ptrmyblock);
535 	}	/* if (mesending...) { */
536 	if (mesending && recver[step] >= 0 &&
537 	    (sens == myrang > step)) {
538 	  i = recver[step] / q1;
539 	  j = recver[step] % q1;
540 	  if (sendsize > 0
541 	      && (step != myrang || !merecving)
542 		) {
543 	    Cigesd2d(gcontext, sendsize, 1, ptrsendbuff, sendsize,
544 		     0, proc1[i * q1 + j]);
545 	  }	/* sendsize > 0 */
546 	}	/* if (mesending ... */
547 	if (merecving && sender[step] >= 0 &&
548 	    (sens == myrang <= step)) {
549 	  i = sender[step] / q0;
550 	  j = sender[step] % q0;
551 	  vinter_nb = scan_intervals('r', ib, ia, m, mb, ma, p1, p0, myprow1, i,
552 				     v_inter);
553 	  hinter_nb = scan_intervals('c', jb, ja, n, mb, ma, q1, q0, mypcol1, j,
554 				     h_inter);
555 	  scanD0(uplo, diag, SIZEBUFF, ptrNULL, &recvsize,
556 		 m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1,
557 		 v_inter, vinter_nb, h_inter, hinter_nb, ptrNULL);
558 	  if (recvsize > 0) {
559 	    if (step == myrang && mesending) {
560 	      Clacpy(recvsize, 1,
561 		     ptrsendbuff, recvsize,
562 		     ptrrecvbuff, recvsize);
563 	    } else {
564 	      Cigerv2d(gcontext, recvsize, 1, ptrrecvbuff, recvsize,
565 		       0, proc0[i * q0 + j]);
566 	    }
567 	  }	/* recvsize > 0 */
568 	}	/* if (merecving ...) */
569 	if (merecving && sender[step] >= 0 && sens == 1) {
570 	  scanD0(uplo, diag, RECVBUFF, ptrrecvbuff, &recvsize,
571 		 m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1,
572 		 v_inter, vinter_nb, h_inter, hinter_nb, ptrmynewblock);
573 	}	/* if (merecving...)  */
574       }	/* for (sens = 0) */
575       step -= 1;
576       if (step < 0)
577 	step = tot - 1;
578     } while (step != tot - 1 - myrang);
579 after_comm:
580     free(sender);
581   }	/* { int tot,nr,ns ...} */
582   /* don't forget to clean up things! */
583   if (myprow1 >= 0 && mypcol1 >= 0) {
584     freememory((char *) ptrrecvbuff);
585   };
586   if (myprow0 >= 0 && mypcol0 >= 0) {
587     freememory((char *) ptrsendbuff);
588   };
589   if (nprow != 1)
590     Cblacs_gridexit(gcontext);
591   free(v_inter);
592   free(h_inter);
593   free(param);
594 }/* distrib */
595 static2 void
init_chenille(mypnum,nprocs,n0,proc0,n1,proc1,psend,precv,myrang)596 init_chenille(mypnum, nprocs, n0, proc0, n1, proc1, psend, precv, myrang)
597   int   nprocs, mypnum, n0, n1;
598   int  *proc0, *proc1, **psend, **precv, *myrang;
599 {
600   int   ns, nr, i, tot;
601   int  *sender, *recver, *g0, *g1;
602   tot = max(n0, n1);
603   sender = (int *) mr2d_malloc((nprocs + tot) * sizeof(int) * 2);
604   recver = sender + tot;
605   *psend = sender;
606   *precv = recver;
607   g0 = recver + tot;
608   g1 = g0 + nprocs;
609   for (i = 0; i < nprocs; i++) {
610     g0[i] = -1;
611     g1[i] = -1;
612   }
613   for (i = 0; i < tot; i++) {
614     sender[i] = -1;
615     recver[i] = -1;
616   }
617   for (i = 0; i < n0; i++)
618     g0[proc0[i]] = i;
619   for (i = 0; i < n1; i++)
620     g1[proc1[i]] = i;
621   ns = 0;
622   nr = 0;
623   *myrang = -1;
624   for (i = 0; i < nprocs; i++)
625     if (g0[i] >= 0 && g1[i] >= 0) {
626       if (i == mypnum)
627 	*myrang = nr;
628       sender[ns] = g0[i];
629       ns += 1;
630       recver[nr] = g1[i];
631       nr += 1;
632       assert(ns <= n0 && nr <= n1 && nr == ns);
633     }
634   for (i = 0; i < nprocs; i++)
635     if (g0[i] >= 0 && g1[i] < 0) {
636       if (i == mypnum)
637 	*myrang = ns;
638       sender[ns] = g0[i];
639       ns += 1;
640       assert(ns <= n0);
641     }
642   for (i = 0; i < nprocs; i++)
643     if (g1[i] >= 0 && g0[i] < 0) {
644       if (i == mypnum)
645 	*myrang = nr;
646       recver[nr] = g1[i];
647       nr += 1;
648       assert(nr <= n1);
649     }
650 }
651 void
Clacpy(m,n,a,lda,b,ldb)652 Clacpy(m, n, a, lda, b, ldb)
653   int  *a, *b;
654   int   m, n, lda, ldb;
655 {
656   int   i, j;
657   lda -= m;
658   ldb -= m;
659   assert(lda >= 0 && ldb >= 0);
660   for (j = 0; j < n; j++) {
661     for (i = 0; i < m; i++)
662       *b++ = *a++;
663     b += ldb;
664     a += lda;
665   }
666 }
667 static2 void
gridreshape(ctxtp)668 gridreshape(ctxtp)
669   int  *ctxtp;
670 {
671   int   ori, final;	/* original context, and new context created, with
672 			 * line form */
673   int   nprow, npcol, myrow, mycol;
674   int  *usermap;
675   int   i, j;
676   ori = *ctxtp;
677   Cblacs_gridinfo(ori, &nprow, &npcol, &myrow, &mycol);
678   usermap = mr2d_malloc(sizeof(int) * nprow * npcol);
679   for (i = 0; i < nprow; i++)
680     for (j = 0; j < npcol; j++) {
681       usermap[i + j * nprow] = Cblacs_pnum(ori, i, j);
682     }
683   /* Cblacs_get(0, 0, &final); */
684   Cblacs_get(ori, 10, &final);
685   Cblacs_gridmap(&final, usermap, 1, 1, nprow * npcol);
686   *ctxtp = final;
687   free(usermap);
688 }
689