#include "redist.h"
/** $Id: pctrmr.c,v 1.1.1.1 2000/02/15 18:04:09 susan Exp $
------------------------------------------------------------------------

  -- ScaLAPACK routine (version 1.7) --
     Oak Ridge National Laboratory, Univ. of Tennessee, and Univ. of
     California, Berkeley.
     October 31, 1994.

      SUBROUTINE PCTRMR2D(UPLO, DIAG, M, N,
     $                    A, IA, JA, ADESC,
     $                    B, IB, JB, BDESC,
     $                    CTXT)
------------------------------------------------------------------------
  Purpose
  =======

  PCTRMR2D copies a submatrix of A onto a submatrix of B.
  A and B can have different distributions: they can be on different
  processor grids, they can have different blocksizes, and the beginning
  of the area to be copied can be at different places in A and B.

  The parameters can be confusing when the grids of A and B are
  partially or completely disjoint: when a processor calls this
  routine but is not in the A context or not in the B context, the
  corresponding ADESC[CTXT] or BDESC[CTXT] must be equal to -1 so
  that the routine recognises this situation.
  To summarize the rules:
  - If a processor is in the A context, all parameters related to A must be valid.
  - If a processor is in the B context, all parameters related to B must be valid.
  - ADESC[CTXT] and BDESC[CTXT] must be either valid contexts or equal to -1.
  - M and N must be valid for everyone.
  - Other parameters are not examined.

  The submatrix to be copied is assumed to be trapezoidal. So only
  the upper or the lower part will be copied. The other part is
  unchanged.

  Notes
  =====

  A description vector is associated with each 2D block-cyclically
  distributed matrix. This vector stores the information required to
  establish the mapping between a matrix entry and its corresponding
  process and memory location.

  In the following comments, the character _ should be read as
  "of the distributed matrix". Let A be a generic term for any 2D
  block-cyclically distributed matrix. Its description vector is DESC_A:

 NOTATION        STORED IN      EXPLANATION
 --------------- -------------- --------------------------------------
 DT_A   (global) DESCA( DT_ )   The descriptor type.
 CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
                                the BLACS process grid A is
                                distributed over. The context itself
                                is global, but the handle (the integer
                                value) may vary.
 M_A    (global) DESCA( M_ )    The number of rows in the distributed
                                matrix A.
 N_A    (global) DESCA( N_ )    The number of columns in the
                                distributed matrix A.
 MB_A   (global) DESCA( MB_ )   The blocking factor used to distribute
                                the rows of A.
 NB_A   (global) DESCA( NB_ )   The blocking factor used to distribute
                                the columns of A.
 RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
                                row of the matrix A is distributed.
 CSRC_A (global) DESCA( CSRC_ ) The process column over which the
                                first column of A is distributed.
 LLD_A  (local)  DESCA( LLD_ )  The leading dimension of the local
                                array storing the local blocks of the
                                distributed matrix A.
                                LLD_A >= MAX(1,LOCp(M_A)).
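
  As an illustration only (ictxt and the size, blocking and leading
  dimension values below are placeholders, not part of this interface),
  such a descriptor can be filled from C as a 9-integer array whose
  entries follow the order of the table above:

      int desc_A[9];
      desc_A[0] = 1;        // DT_A   : descriptor type (BLOCK_CYCLIC_2D)
      desc_A[1] = ictxt;    // CTXT_A : BLACS context handle
      desc_A[2] = m_A;      // M_A    : global number of rows
      desc_A[3] = n_A;      // N_A    : global number of columns
      desc_A[4] = mb_A;     // MB_A   : row blocking factor
      desc_A[5] = nb_A;     // NB_A   : column blocking factor
      desc_A[6] = 0;        // RSRC_A : first process row
      desc_A[7] = 0;        // CSRC_A : first process column
      desc_A[8] = lld_A;    // LLD_A  : local leading dimension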


  Important notice
  ================
  The parameters of the routine changed in April 1996.
  There is a new last argument: it must be a context encompassing
  all processors involved in the initial and final distributions.

  Be aware that all processors included in this
  context must call the redistribution routine.

  Parameters
  ==========

  UPLO     (input) CHARACTER*1.
           On entry, UPLO specifies whether we should copy the upper
           part or the lower part of the defined submatrix:
           UPLO = 'U' or 'u' copy the upper triangular part.
           UPLO = 'L' or 'l' copy the lower triangular part.
           Unchanged on exit.

  DIAG     (input) CHARACTER*1.
           On entry, DIAG specifies whether we should copy the diagonal.
           DIAG = 'U' or 'u' do NOT copy the diagonal of the submatrix.
           DIAG = 'N' or 'n' DO copy the diagonal of the submatrix.
           Unchanged on exit.

  M        (input) INTEGER.
           On entry, M specifies the number of rows of the
           submatrix to be copied. M must be at least zero.
           Unchanged on exit.

  N        (input) INTEGER.
           On entry, N specifies the number of columns of the submatrix
           to be redistributed. N must be at least zero.
           Unchanged on exit.

  A        (input) COMPLEX
           On entry, the source matrix.
           Unchanged on exit.

  IA,JA    (input) INTEGER
           On entry, the coordinates of the beginning of the submatrix
           of A to copy.
           1 <= IA <= M_A - M + 1, 1 <= JA <= N_A - N + 1.
           Unchanged on exit.

  ADESC    (input) A description vector (see Notes above)
           If the current processor is not part of the context of A,
           then ADESC[CTXT] must be equal to -1.

  B        (output) COMPLEX
           On entry, the destination matrix.
           The portion corresponding to the defined submatrix is updated.

  IB,JB    (input) INTEGER
           On entry, the coordinates of the beginning of the submatrix
           of B that will be updated.
           1 <= IB <= M_B - M + 1, 1 <= JB <= N_B - N + 1.
           Unchanged on exit.

  BDESC    (input) B description vector (see Notes above)
           For processors not part of the context of B,
           BDESC[CTXT] must be equal to -1.

  CTXT     (input) a context encompassing at least all processors
           included in either the A context or the B context.

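  As a usage sketch only (the process grid setup and the local array,
  size and descriptor names below are placeholders, not part of this
  file), the C interface could be called as follows; every process of
  gcontext must make the call, and a process that owns no part of A or
  B passes -1 in the CTXT entry of the corresponding descriptor:

      // Copy the upper triangle, diagonal included, of an m x n
      // submatrix starting at (1,1) in A to (1,1) in B.
      Cpctrmr2d("U", "N", m, n,
                A_loc, 1, 1, (MDESC *) desc_A,
                B_loc, 1, 1, (MDESC *) desc_B, gcontext);

  The Fortran-callable wrapper (fortran_mr2dnew below) performs exactly
  this dereference-and-cast on its INTEGER and descriptor arguments.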

 Memory requirement:
 ===================

   The processors belonging to grid 0 need one buffer of the size of a
   memory block of A, and the processors belonging to grid 1 need one
   buffer of the size of a memory block of B.

 ============================================================
 Created March 1993 by B. Tourancheau (See sccs for modifications).
 Modifications by Loic PRYLLI 1995
 ============================================================ */
#define static2 static
#if defined(Add_) || defined(f77IsF2C)
#define fortran_mr2d pctrmr2do_
#define fortran_mr2dnew pctrmr2d_
#elif defined(UpCase)
#define fortran_mr2dnew PCTRMR2D
#define fortran_mr2d PCTRMR2DO
#define ccopy_ CCOPY
#define clacpy_ CLACPY
#else
#define fortran_mr2d pctrmr2do
#define fortran_mr2dnew pctrmr2d
#define ccopy_ ccopy
#define clacpy_ clacpy
#endif
#define Clacpy Cctrlacpy
void Clacpy();
typedef struct {
  float r, i;
} complex;
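/* The MDESC structure below mirrors, field for field, the 9-integer
 * descriptor documented in the header Notes (DT, CTXT, M, N, MB, NB,
 * RSRC, CSRC, LLD), which is why the Fortran descriptor arrays can be
 * cast directly to MDESC pointers in the wrappers further down. */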
typedef struct {
  int desctype;
  int ctxt;
  int m;
  int n;
  int nbrow;
  int nbcol;
  int sprow;
  int spcol;
  int lda;
} MDESC;
#define BLOCK_CYCLIC_2D 1
typedef struct {
  int gstart;
  int len;
} IDESC;
#define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow)))
#define max(A,B) ((A)>(B)?(A):(B))
#define min(A,B) ((A)>(B)?(B):(A))
#define DIVUP(a,b) ( ((a)-1) /(b)+1)
#define ROUNDUP(a,b) (DIVUP(a,b)*(b))
#ifdef MALLOCDEBUG
#define malloc mymalloc
#define free myfree
#define realloc myrealloc
#endif
/* Cblacs */
extern void Cblacs_pcoord();
extern int Cblacs_pnum();
extern void Csetpvmtids();
extern void Cblacs_get();
extern void Cblacs_pinfo();
extern void Cblacs_gridinfo();
extern void Cblacs_gridinit();
extern void Cblacs_gridmap();
extern void Cblacs_exit();
extern void Cblacs_gridexit();
extern void Cblacs_setup();
extern void Cigebs2d();
extern void Cigebr2d();
extern void Cigesd2d();
extern void Cigerv2d();
extern void Cigsum2d();
extern void Cigamn2d();
extern void Cigamx2d();
extern void Ccgesd2d();
extern void Ccgerv2d();
/* lapack */
void clacpy_();
/* aux functions */
extern int localindice();
extern void *mr2d_malloc();
extern int ppcm();
extern int localsize();
extern int memoryblocksize();
extern int changeorigin();
extern void paramcheck();
/* tools and other functions */
#define scanD0 ctrscanD0
#define dispmat ctrdispmat
#define setmemory ctrsetmemory
#define freememory ctrfreememory
#define scan_intervals ctrscan_intervals
extern void scanD0();
extern void dispmat();
extern void setmemory();
extern void freememory();
extern int scan_intervals();
extern void Cpctrmr2do();
extern void Cpctrmr2d();
/* some defines for Cpctrmr2do */
#define SENDBUFF 0
#define RECVBUFF 1
#define SIZEBUFF 2
#if 0
#define DEBUG
#endif
#ifndef DEBUG
#define NDEBUG
#endif
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#define DESCLEN 9
void
fortran_mr2d(uplo, diag, m, n, A, ia, ja, desc_A,
             B, ib, jb, desc_B)
  char *uplo, *diag;
  int *ia, *ib, *ja, *jb, *m, *n;
  int desc_A[DESCLEN], desc_B[DESCLEN];
  complex *A, *B;
{
  Cpctrmr2do(uplo, diag, *m, *n, A, *ia, *ja, (MDESC *) desc_A,
             B, *ib, *jb, (MDESC *) desc_B);
  return;
}
void
fortran_mr2dnew(uplo, diag, m, n, A, ia, ja, desc_A,
                B, ib, jb, desc_B, gcontext)
  char *uplo, *diag;
  int *ia, *ib, *ja, *jb, *m, *n;
  int desc_A[DESCLEN], desc_B[DESCLEN];
  complex *A, *B;
  int *gcontext;
{
  Cpctrmr2d(uplo, diag, *m, *n, A, *ia, *ja, (MDESC *) desc_A,
            B, *ib, *jb, (MDESC *) desc_B, *gcontext);
  return;
}
static2 void init_chenille();
static2 int inter_len();
static2 int block2buff();
static2 void buff2block();
static2 void gridreshape();
void
Cpctrmr2do(uplo, diag, m, n,
           ptrmyblock, ia, ja, ma,
           ptrmynewblock, ib, jb, mb)
  char *uplo, *diag;
  complex *ptrmyblock, *ptrmynewblock;
/* pointers to the memory location of the matrix and the redistributed matrix */
  MDESC *ma;
  MDESC *mb;
  int ia, ja, ib, jb, m, n;
{
  int dummy, nprocs;
  int gcontext;
  /* first we initialize a global grid which serves as a reference to
   * communicate from grid a to grid b */
  Cblacs_pinfo(&dummy, &nprocs);
  Cblacs_get(0, 0, &gcontext);
  Cblacs_gridinit(&gcontext, "R", 1, nprocs);
  Cpctrmr2d(uplo, diag, m, n, ptrmyblock, ia, ja, ma,
            ptrmynewblock, ib, jb, mb, gcontext);
  Cblacs_gridexit(gcontext);
}
#define NBPARAM 20   /* p0,q0,p1,q1, then ma,na,mba,nba,rowa,cola, then
                      * the same for B, then ia,ja, then ib,jb */
#define MAGIC_MAX 100000000
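/* Cpctrmr2d proceeds in three stages: the grid shapes, block sizes and
 * submatrix origins of A and B are exchanged among all processes of the
 * global context with Cigamn2d (processes outside a grid contribute
 * MAGIC_MAX, so the minimum reduction fills in the values they are
 * missing); the origins are then normalized with changeorigin and the
 * send/receive buffers are allocated; finally the processes exchange the
 * intersecting pieces in the round-robin order built by init_chenille,
 * packing and unpacking them with scanD0. */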
void
Cpctrmr2d(uplo, diag, m, n,
          ptrmyblock, ia, ja, ma,
          ptrmynewblock, ib, jb, mb, globcontext)
  char *uplo, *diag;
  complex *ptrmyblock, *ptrmynewblock;
/* pointers to the memory location of the matrix and the redistributed matrix */
  MDESC *ma;
  MDESC *mb;
  int ia, ja, ib, jb, m, n, globcontext;
{
  complex *ptrsendbuff, *ptrrecvbuff, *ptrNULL = 0;
  complex *recvptr;
  MDESC newa, newb;
  int *proc0, *proc1, *param;
  int mypnum, myprow0, mypcol0, myprow1, mypcol1, nprocs;
  int i, j;
  int nprow, npcol, gcontext;
  int recvsize, sendsize;
  IDESC *h_inter;   /* to store the horizontal intersections */
  IDESC *v_inter;   /* to store the vertical intersections */
  int hinter_nb, vinter_nb;   /* number of intersections in both directions */
  int dummy;
  int p0, q0, p1, q1;
  int *ra, *ca;
  /* end of variables */
  /* To simplify further calculations we change the matrix indexing from
   * 1..m,1..n (Fortran) to 0..m-1,0..n-1 */
  if (m == 0 || n == 0)
    return;
  ia -= 1;
  ja -= 1;
  ib -= 1;
  jb -= 1;
  Cblacs_gridinfo(globcontext, &nprow, &npcol, &dummy, &mypnum);
  gcontext = globcontext;
  nprocs = nprow * npcol;
  /* if the global context we are given does not have the shape of a single
   * row (nprow != 1), create a new context.  TODO: to be optimal, we should
   * avoid this because it is an unnecessary synchronisation */
  if (nprow != 1) {
    gridreshape(&gcontext);
    Cblacs_gridinfo(gcontext, &dummy, &dummy, &dummy, &mypnum);
  }
  Cblacs_gridinfo(ma->ctxt, &p0, &q0, &myprow0, &mypcol0);
  /* T3D compatibility: must check that myprow and mypcol are within bounds */
  if (myprow0 >= p0 || mypcol0 >= q0)
    myprow0 = mypcol0 = -1;
  assert((myprow0 < p0 && mypcol0 < q0) || (myprow0 == -1 && mypcol0 == -1));
  Cblacs_gridinfo(mb->ctxt, &p1, &q1, &myprow1, &mypcol1);
  if (myprow1 >= p1 || mypcol1 >= q1)
    myprow1 = mypcol1 = -1;
  assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1));
  /* exchange the missing parameters among the processors: shape of the grids
   * and location of the processors */
  param = (int *) mr2d_malloc(3 * (nprocs * 2 + NBPARAM) * sizeof(int));
  ra = param + nprocs * 2 + NBPARAM;
  ca = param + (nprocs * 2 + NBPARAM) * 2;
  for (i = 0; i < nprocs * 2 + NBPARAM; i++)
    param[i] = MAGIC_MAX;
  proc0 = param + NBPARAM;
  proc1 = param + NBPARAM + nprocs;
  /* we calculate proc0 and proc1, which give the number of a proc in
   * respectively a or b in the global context */
  if (myprow0 >= 0) {
    proc0[myprow0 * q0 + mypcol0] = mypnum;
    param[0] = p0;
    param[1] = q0;
    param[4] = ma->m;
    param[5] = ma->n;
    param[6] = ma->nbrow;
    param[7] = ma->nbcol;
    param[8] = ma->sprow;
    param[9] = ma->spcol;
    param[10] = ia;
    param[11] = ja;
  }
  if (myprow1 >= 0) {
    proc1[myprow1 * q1 + mypcol1] = mypnum;
    param[2] = p1;
    param[3] = q1;
    param[12] = mb->m;
    param[13] = mb->n;
    param[14] = mb->nbrow;
    param[15] = mb->nbcol;
    param[16] = mb->sprow;
    param[17] = mb->spcol;
    param[18] = ib;
    param[19] = jb;
  }
  Cigamn2d(gcontext, "All", "H", 2 * nprocs + NBPARAM, 1, param, 2 * nprocs + NBPARAM,
           ra, ca, 2 * nprocs + NBPARAM, -1, -1);
  newa = *ma;
  newb = *mb;
  ma = &newa;
  mb = &newb;
  if (myprow0 == -1) {
    p0 = param[0];
    q0 = param[1];
    ma->m = param[4];
    ma->n = param[5];
    ma->nbrow = param[6];
    ma->nbcol = param[7];
    ma->sprow = param[8];
    ma->spcol = param[9];
    ia = param[10];
    ja = param[11];
  }
  if (myprow1 == -1) {
    p1 = param[2];
    q1 = param[3];
    mb->m = param[12];
    mb->n = param[13];
    mb->nbrow = param[14];
    mb->nbcol = param[15];
    mb->sprow = param[16];
    mb->spcol = param[17];
    ib = param[18];
    jb = param[19];
  }
  for (i = 0; i < NBPARAM; i++) {
    if (param[i] == MAGIC_MAX) {
      fprintf(stderr, "xxTRMR2D: something wrong in the parameters\n");
      exit(1);
    }
  }
#ifndef NDEBUG
  for (i = 0; i < p0 * q0; i++)
    assert(proc0[i] >= 0 && proc0[i] < nprocs);
  for (i = 0; i < p1 * q1; i++)
    assert(proc1[i] >= 0 && proc1[i] < nprocs);
#endif
  /* check the validity of the parameters */
  paramcheck(ma, ia, ja, m, n, p0, q0, gcontext);
  paramcheck(mb, ib, jb, m, n, p1, q1, gcontext);
  /* we change the problem so that ia < a->nbrow ... and ia + m = a->m ... */
  {
    int decal;
    ia = changeorigin(myprow0, ma->sprow, p0,
                      ma->nbrow, ia, &decal, &ma->sprow);
    ptrmyblock += decal;
    ja = changeorigin(mypcol0, ma->spcol, q0,
                      ma->nbcol, ja, &decal, &ma->spcol);
    ptrmyblock += decal * ma->lda;
    ma->m = ia + m;
    ma->n = ja + n;
    ib = changeorigin(myprow1, mb->sprow, p1,
                      mb->nbrow, ib, &decal, &mb->sprow);
    ptrmynewblock += decal;
    jb = changeorigin(mypcol1, mb->spcol, q1,
                      mb->nbcol, jb, &decal, &mb->spcol);
    ptrmynewblock += decal * mb->lda;
    mb->m = ib + m;
    mb->n = jb + n;
    if (p0 == 1)
      ma->nbrow = ma->m;
    if (q0 == 1)
      ma->nbcol = ma->n;
    if (p1 == 1)
      mb->nbrow = mb->m;
    if (q1 == 1)
      mb->nbcol = mb->n;
#ifndef NDEBUG
    paramcheck(ma, ia, ja, m, n, p0, q0, gcontext);
    paramcheck(mb, ib, jb, m, n, p1, q1, gcontext);
#endif
  }
  /* We compute the size of the memory buffer (we choose the worst case,
   * when the buffer sizes == the memory block sizes). */
  if (myprow0 >= 0 && mypcol0 >= 0) {
    /* Initialize pointer variables */
    setmemory(&ptrsendbuff, memoryblocksize(ma));
  };   /* if (mypnum < p0 * q0) */
  if (myprow1 >= 0 && mypcol1 >= 0) {
    /* Initialize pointer variables */
    setmemory(&ptrrecvbuff, memoryblocksize(mb));
  };   /* if (mypnum < p1 * q1) */
  /* allocate room for the interval tables; we allocate for the worst case,
   * local_n or local_m intervals; in fact the worst case should be less,
   * perhaps half that, I should think about that one day. */
  h_inter = (IDESC *) mr2d_malloc(DIVUP(ma->n, q0 * ma->nbcol) *
                                  ma->nbcol * sizeof(IDESC));
  v_inter = (IDESC *) mr2d_malloc(DIVUP(ma->m, p0 * ma->nbrow)
                                  * ma->nbrow * sizeof(IDESC));
  /* We go for the scanning of indices. For each processor including mypnum,
   * we fill the sendbuff buffer (scanD0(SENDBUFF)) and when it is done send
   * it. Then for each processor, we compute the size of the message to be
   * received (scanD0(SIZEBUFF)), post a receive and then dispatch the
   * elements of recvbuff to the right place (scanD0(RECVBUFF)) */
  recvptr = ptrrecvbuff;
  {
    int tot, myrang, step, sens;
    int *sender, *recver;
    int mesending, merecving;
    tot = max(p0 * q0, p1 * q1);
    init_chenille(mypnum, nprocs, p0 * q0, proc0, p1 * q1, proc1,
                  &sender, &recver, &myrang);
    if (myrang == -1)
      goto after_comm;
    mesending = myprow0 >= 0;
    assert(sender[myrang] >= 0 || !mesending);
    assert(!mesending || proc0[sender[myrang]] == mypnum);
    merecving = myprow1 >= 0;
    assert(recver[myrang] >= 0 || !merecving);
    assert(!merecving || proc1[recver[myrang]] == mypnum);
    step = tot - 1 - myrang;
    do {
      for (sens = 0; sens < 2; sens++) {
        /* be careful here: when we are communicating with ourselves, we must
         * send first (myrang > step == 0) */
        if (mesending && recver[step] >= 0 &&
            (sens == 0)) {
          i = recver[step] / q1;
          j = recver[step] % q1;
          vinter_nb = scan_intervals('r', ia, ib, m, ma, mb, p0, p1, myprow0, i,
                                     v_inter);
          hinter_nb = scan_intervals('c', ja, jb, n, ma, mb, q0, q1, mypcol0, j,
                                     h_inter);
          scanD0(uplo, diag, SENDBUFF, ptrsendbuff, &sendsize,
                 m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1,
                 v_inter, vinter_nb, h_inter, hinter_nb,
                 ptrmyblock);
        }   /* if (mesending...) { */
        if (mesending && recver[step] >= 0 &&
            (sens == (myrang > step))) {
          i = recver[step] / q1;
          j = recver[step] % q1;
          if (sendsize > 0
              && (step != myrang || !merecving)
              ) {
            Ccgesd2d(gcontext, sendsize, 1, ptrsendbuff, sendsize,
                     0, proc1[i * q1 + j]);
          }   /* sendsize > 0 */
        }   /* if (mesending ... */
        if (merecving && sender[step] >= 0 &&
            (sens == (myrang <= step))) {
          i = sender[step] / q0;
          j = sender[step] % q0;
          vinter_nb = scan_intervals('r', ib, ia, m, mb, ma, p1, p0, myprow1, i,
                                     v_inter);
          hinter_nb = scan_intervals('c', jb, ja, n, mb, ma, q1, q0, mypcol1, j,
                                     h_inter);
          scanD0(uplo, diag, SIZEBUFF, ptrNULL, &recvsize,
                 m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1,
                 v_inter, vinter_nb, h_inter, hinter_nb, ptrNULL);
          if (recvsize > 0) {
            if (step == myrang && mesending) {
              Clacpy(recvsize, 1,
                     ptrsendbuff, recvsize,
                     ptrrecvbuff, recvsize);
            } else {
              Ccgerv2d(gcontext, recvsize, 1, ptrrecvbuff, recvsize,
                       0, proc0[i * q0 + j]);
            }
          }   /* recvsize > 0 */
        }   /* if (merecving ...) */
        if (merecving && sender[step] >= 0 && sens == 1) {
          scanD0(uplo, diag, RECVBUFF, ptrrecvbuff, &recvsize,
                 m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1,
                 v_inter, vinter_nb, h_inter, hinter_nb, ptrmynewblock);
        }   /* if (merecving...) */
      }   /* for (sens = 0) */
      step -= 1;
      if (step < 0)
        step = tot - 1;
    } while (step != tot - 1 - myrang);
after_comm:
    free(sender);
  }   /* { int tot,nr,ns ...} */
  /* don't forget to clean things up! */
  if (myprow1 >= 0 && mypcol1 >= 0) {
    freememory((char *) ptrrecvbuff);
  };
  if (myprow0 >= 0 && mypcol0 >= 0) {
    freememory((char *) ptrsendbuff);
  };
  if (nprow != 1)
    Cblacs_gridexit(gcontext);
  free(v_inter);
  free(h_inter);
  free(param);
}   /* distrib */
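/* init_chenille builds the communication schedule ("chenille" is French for
 * caterpillar): it fills sender[] and recver[] with, for each rank of the
 * schedule, the index of the corresponding process in grid 0 (the senders)
 * and in grid 1 (the receivers).  Processes belonging to both grids come
 * first so that local copies line up, and *myrang is set to the calling
 * process's own position in the schedule, or -1 if it takes no part in the
 * communication. */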
static2 void
init_chenille(mypnum, nprocs, n0, proc0, n1, proc1, psend, precv, myrang)
  int nprocs, mypnum, n0, n1;
  int *proc0, *proc1, **psend, **precv, *myrang;
{
  int ns, nr, i, tot;
  int *sender, *recver, *g0, *g1;
  tot = max(n0, n1);
  sender = (int *) mr2d_malloc((nprocs + tot) * sizeof(int) * 2);
  recver = sender + tot;
  *psend = sender;
  *precv = recver;
  g0 = recver + tot;
  g1 = g0 + nprocs;
  for (i = 0; i < nprocs; i++) {
    g0[i] = -1;
    g1[i] = -1;
  }
  for (i = 0; i < tot; i++) {
    sender[i] = -1;
    recver[i] = -1;
  }
  for (i = 0; i < n0; i++)
    g0[proc0[i]] = i;
  for (i = 0; i < n1; i++)
    g1[proc1[i]] = i;
  ns = 0;
  nr = 0;
  *myrang = -1;
  for (i = 0; i < nprocs; i++)
    if (g0[i] >= 0 && g1[i] >= 0) {
      if (i == mypnum)
        *myrang = nr;
      sender[ns] = g0[i];
      ns += 1;
      recver[nr] = g1[i];
      nr += 1;
      assert(ns <= n0 && nr <= n1 && nr == ns);
    }
  for (i = 0; i < nprocs; i++)
    if (g0[i] >= 0 && g1[i] < 0) {
      if (i == mypnum)
        *myrang = ns;
      sender[ns] = g0[i];
      ns += 1;
      assert(ns <= n0);
    }
  for (i = 0; i < nprocs; i++)
    if (g1[i] >= 0 && g0[i] < 0) {
      if (i == mypnum)
        *myrang = nr;
      recver[nr] = g1[i];
      nr += 1;
      assert(nr <= n1);
    }
}
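/* Clacpy copies an m-by-n column-major block from a (leading dimension lda)
 * to b (leading dimension ldb); above it is only used for the local
 * send-buffer to receive-buffer copy when a process sends to itself. */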
void
Clacpy(m, n, a, lda, b, ldb)
  complex *a, *b;
  int m, n, lda, ldb;
{
  int i, j;
  lda -= m;
  ldb -= m;
  assert(lda >= 0 && ldb >= 0);
  for (j = 0; j < n; j++) {
    for (i = 0; i < m; i++)
      *b++ = *a++;
    b += ldb;
    a += lda;
  }
}
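/* gridreshape replaces the context *ctxtp by a new 1 x (nprow*npcol) row
 * context containing exactly the same processes, built with Cblacs_gridmap
 * from the system context associated with the original grid. */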
static2 void
gridreshape(ctxtp)
  int *ctxtp;
{
  int ori, final;   /* original context, and new context created,
                     * in row form */
  int nprow, npcol, myrow, mycol;
  int *usermap;
  int i, j;
  ori = *ctxtp;
  Cblacs_gridinfo(ori, &nprow, &npcol, &myrow, &mycol);
  usermap = mr2d_malloc(sizeof(int) * nprow * npcol);
  for (i = 0; i < nprow; i++)
    for (j = 0; j < npcol; j++) {
      usermap[i + j * nprow] = Cblacs_pnum(ori, i, j);
    }
  /* Cblacs_get(0, 0, &final); */
  Cblacs_get(ori, 10, &final);
  Cblacs_gridmap(&final, usermap, 1, 1, nprow * npcol);
  *ctxtp = final;
  free(usermap);
}