1 #include "redist.h"
2 /** $Id: psgemr.c,v 1.1.1.1 2000/02/15 18:04:09 susan Exp $
3 ------------------------------------------------------------------------
4
5 -- ScaLAPACK routine (version 1.7) --
6 Oak Ridge National Laboratory, Univ. of Tennessee, and Univ. of
7 California, Berkeley.
8 October 31, 1994.
9
10 SUBROUTINE PSGEMR2D( M, N,
11 $ A, IA, JA, ADESC,
12 $ B, IB, JB, BDESC,
13 $ CTXT)
14 ------------------------------------------------------------------------
15 Purpose
16 =======
17
18 PSGEMR2D copies a submatrix of A on a submatrix of B.
19 A and B can have different distributions: they can be on different
20 processor grids, they can have different blocksizes, the beginning
21 of the area to be copied can be at a different places on A and B.
22
23 The parameters can be confusing when the grids of A and B are
24 partially or completly disjoint, in the case a processor calls
25 this routines but is either not in the A context or B context, the
26 ADESC[CTXT] or BDESC[CTXT] must be equal to -1, to ensure the
27 routine recognise this situation.
28 To summarize the rule:
29 - If a processor is in A context, all parameters related to A must be valid.
30 - If a processor is in B context, all parameters related to B must be valid.
31 - ADESC[CTXT] and BDESC[CTXT] must be either valid contexts or equal to -1.
32 - M and N must be valid for everyone.
33 - other parameters are not examined.
34
35
36 Notes
37 =====
38
39 A description vector is associated with each 2D block-cyclicly dis-
40 tributed matrix. This vector stores the information required to
41 establish the mapping between a matrix entry and its corresponding
42 process and memory location.
43
44 In the following comments, the character _ should be read as
45 "of the distributed matrix". Let A be a generic term for any 2D
46 block cyclicly distributed matrix. Its description vector is DESC_A:
47
48 NOTATION STORED IN EXPLANATION
49 --------------- -------------- --------------------------------------
50 DT_A (global) DESCA( DT_ ) The descriptor type.
51 CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
52 the BLACS process grid A is distribu-
53 ted over. The context itself is glo-
54 bal, but the handle (the integer
55 value) may vary.
56 M_A (global) DESCA( M_ ) The number of rows in the distributed
57 matrix A.
58 N_A (global) DESCA( N_ ) The number of columns in the distri-
59 buted matrix A.
60 MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
61 the rows of A.
62 NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
63 the columns of A.
64 RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
65 row of the matrix A is distributed.
66 CSRC_A (global) DESCA( CSRC_ ) The process column over which the
67 first column of A is distributed.
68 LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
69 array storing the local blocks of the
70 distributed matrix A.
71 LLD_A >= MAX(1,LOCp(M_A)).
72
73
74
75 Important notice
76 ================
77 The parameters of the routine have changed in April 1996
78 There is a new last argument. It must be a context englobing
79 all processors involved in the initial and final distribution.
80
81 Be aware that all processors included in this
82 context must call the redistribution routine.
83
84 Parameters
85 ==========
86
87
88 M (input) INTEGER.
89 On entry, M specifies the number of rows of the
90 submatrix to be copied. M must be at least zero.
91 Unchanged on exit.
92
93 N (input) INTEGER.
94 On entry, N specifies the number of cols of the submatrix
95 to be redistributed.rows of B. M must be at least zero.
96 Unchanged on exit.
97
98 A (input) REAL
99 On entry, the source matrix.
100 Unchanged on exit.
101
102 IA,JA (input) INTEGER
103 On entry,the coordinates of the beginning of the submatrix
104 of A to copy.
105 1 <= IA <= M_A - M + 1,1 <= JA <= N_A - N + 1,
106 Unchanged on exit.
107
108 ADESC (input) A description vector (see Notes above)
109 If the current processor is not part of the context of A
110 the ADESC[CTXT] must be equal to -1.
111
112
113 B (output) REAL
114 On entry, the destination matrix.
115 The portion corresponding to the defined submatrix are updated.
116
117 IB,JB (input) INTEGER
118 On entry,the coordinates of the beginning of the submatrix
119 of B that will be updated.
120 1 <= IB <= M_B - M + 1,1 <= JB <= N_B - N + 1,
121 Unchanged on exit.
122
123 BDESC (input) B description vector (see Notes above)
124 For processors not part of the context of B
125 BDESC[CTXT] must be equal to -1.
126
127 CTXT (input) a context englobing at least all processors included
128 in either A context or B context
129
130
131
132 Memory requirement :
133 ====================
134
135 for the processors belonging to grid 0, one buffer of size block 0
136 and for the processors belonging to grid 1, also one buffer of size
137 block 1.
138
139 ============================================================
140 Created March 1993 by B. Tourancheau (See sccs for modifications).
141 Modifications by Loic PRYLLI 1995
142 ============================================================ */
143 #define static2 static
144 #if defined(Add_) || defined(f77IsF2C)
145 #define fortran_mr2d psgemr2do_
146 #define fortran_mr2dnew psgemr2d_
147 #elif defined(UpCase)
148 #define fortran_mr2dnew PSGEMR2D
149 #define fortran_mr2d PSGEMR2DO
150 #define scopy_ SCOPY
151 #define slacpy_ SLACPY
152 #else
153 #define fortran_mr2d psgemr2do
154 #define fortran_mr2dnew psgemr2d
155 #define scopy_ scopy
156 #define slacpy_ slacpy
157 #endif
158 #define Clacpy Csgelacpy
159 void Clacpy();
160 typedef struct {
161 int desctype;
162 int ctxt;
163 int m;
164 int n;
165 int nbrow;
166 int nbcol;
167 int sprow;
168 int spcol;
169 int lda;
170 } MDESC;
171 #define BLOCK_CYCLIC_2D 1
172 typedef struct {
173 int lstart;
174 int len;
175 } IDESC;
176 #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow)))
177 #define max(A,B) ((A)>(B)?(A):(B))
178 #define min(A,B) ((A)>(B)?(B):(A))
179 #define DIVUP(a,b) ( ((a)-1) /(b)+1)
180 #define ROUNDUP(a,b) (DIVUP(a,b)*(b))
181 #ifdef MALLOCDEBUG
182 #define malloc mymalloc
183 #define free myfree
184 #define realloc myrealloc
185 #endif
186 /* Cblacs */
187 extern void Cblacs_pcoord();
188 extern int Cblacs_pnum();
189 extern void Csetpvmtids();
190 extern void Cblacs_get();
191 extern void Cblacs_pinfo();
192 extern void Cblacs_gridinfo();
193 extern void Cblacs_gridinit();
194 extern void Cblacs_exit();
195 extern void Cblacs_gridexit();
196 extern void Cblacs_setup();
197 extern void Cigebs2d();
198 extern void Cigebr2d();
199 extern void Cigesd2d();
200 extern void Cigerv2d();
201 extern void Cigsum2d();
202 extern void Cigamn2d();
203 extern void Cigamx2d();
204 extern void Csgesd2d();
205 extern void Csgerv2d();
206 /* lapack */
207 void slacpy_();
208 /* aux fonctions */
209 extern int localindice();
210 extern void *mr2d_malloc();
211 extern int ppcm();
212 extern int localsize();
213 extern int memoryblocksize();
214 extern int changeorigin();
215 extern void paramcheck();
216 /* tools and others function */
217 #define scanD0 sgescanD0
218 #define dispmat sgedispmat
219 #define setmemory sgesetmemory
220 #define freememory sgefreememory
221 #define scan_intervals sgescan_intervals
222 extern void scanD0();
223 extern void dispmat();
224 extern void setmemory();
225 extern void freememory();
226 extern int scan_intervals();
227 extern void Cpsgemr2do();
228 extern void Cpsgemr2d();
229 /* some defines for Cpsgemr2do */
230 #define SENDBUFF 0
231 #define RECVBUFF 1
232 #define SIZEBUFF 2
233 #if 0
234 #define DEBUG
235 #endif
236 #ifndef DEBUG
237 #define NDEBUG
238 #endif
239 #include <stdio.h>
240 #include <stdlib.h>
241 #include <assert.h>
242 #define DESCLEN 9
243 void
fortran_mr2d(m,n,A,ia,ja,desc_A,B,ib,jb,desc_B)244 fortran_mr2d(m, n, A, ia, ja, desc_A,
245 B, ib, jb, desc_B)
246 int *ia, *ib, *ja, *jb, *m, *n;
247 int desc_A[DESCLEN], desc_B[DESCLEN];
248 float *A, *B;
249 {
250 Cpsgemr2do(*m, *n, A, *ia, *ja, (MDESC *) desc_A,
251 B, *ib, *jb, (MDESC *) desc_B);
252 return;
253 }
254 void
fortran_mr2dnew(m,n,A,ia,ja,desc_A,B,ib,jb,desc_B,gcontext)255 fortran_mr2dnew(m, n, A, ia, ja, desc_A,
256 B, ib, jb, desc_B, gcontext)
257 int *ia, *ib, *ja, *jb, *m, *n;
258 int desc_A[DESCLEN], desc_B[DESCLEN];
259 float *A, *B;
260 int *gcontext;
261 {
262 Cpsgemr2d(*m, *n, A, *ia, *ja, (MDESC *) desc_A,
263 B, *ib, *jb, (MDESC *) desc_B, *gcontext);
264 return;
265 }
266 static2 void init_chenille();
267 static2 int inter_len();
268 static2 int block2buff();
269 static2 void buff2block();
270 static2 void gridreshape();
271 void
Cpsgemr2do(m,n,ptrmyblock,ia,ja,ma,ptrmynewblock,ib,jb,mb)272 Cpsgemr2do(m, n,
273 ptrmyblock, ia, ja, ma,
274 ptrmynewblock, ib, jb, mb)
275 float *ptrmyblock, *ptrmynewblock;
276 /* pointers to the memory location of the matrix and the redistributed matrix */
277 MDESC *ma;
278 MDESC *mb;
279 int ia, ja, ib, jb, m, n;
280 {
281 int dummy, nprocs;
282 int gcontext;
283 /* first we initialize a global grid which serve as a reference to
284 * communicate from grid a to grid b */
285 Cblacs_pinfo(&dummy, &nprocs);
286 Cblacs_get(0, 0, &gcontext);
287 Cblacs_gridinit(&gcontext, "R", 1, nprocs);
288 Cpsgemr2d(m, n, ptrmyblock, ia, ja, ma,
289 ptrmynewblock, ib, jb, mb, gcontext);
290 Cblacs_gridexit(gcontext);
291 }
292 #define NBPARAM 20 /* p0,q0,p1,q1, puis ma,na,mba,nba,rowa,cola puis
293 * idem B puis ia,ja puis ib,jb */
294 #define MAGIC_MAX 100000000
295 void
Cpsgemr2d(m,n,ptrmyblock,ia,ja,ma,ptrmynewblock,ib,jb,mb,globcontext)296 Cpsgemr2d(m, n,
297 ptrmyblock, ia, ja, ma,
298 ptrmynewblock, ib, jb, mb, globcontext)
299 float *ptrmyblock, *ptrmynewblock;
300 /* pointers to the memory location of the matrix and the redistributed matrix */
301 MDESC *ma;
302 MDESC *mb;
303 int ia, ja, ib, jb, m, n, globcontext;
304 {
305 float *ptrsendbuff, *ptrrecvbuff, *ptrNULL = 0;
306 float *recvptr;
307 MDESC newa, newb;
308 int *proc0, *proc1, *param;
309 int mypnum, myprow0, mypcol0, myprow1, mypcol1, nprocs;
310 int i, j;
311 int nprow, npcol, gcontext;
312 int recvsize, sendsize;
313 IDESC *h_inter; /* to store the horizontal intersections */
314 IDESC *v_inter; /* to store the vertical intersections */
315 int hinter_nb, vinter_nb; /* number of intrsections in both directions */
316 int dummy;
317 int p0, q0, p1, q1;
318 int *ra, *ca;
319 /* end of variables */
320 /* To simplify further calcul we change the matrix indexation from
321 * 1..m,1..n (fortran) to 0..m-1,0..n-1 */
322 if (m == 0 || n == 0)
323 return;
324 ia -= 1;
325 ja -= 1;
326 ib -= 1;
327 jb -= 1;
328 Cblacs_gridinfo(globcontext, &nprow, &npcol, &dummy, &mypnum);
329 gcontext = globcontext;
330 nprocs = nprow * npcol;
331 /* if the global context that is given to us has not the shape of a line
332 * (nprow != 1), create a new context. TODO: to be optimal, we should
333 * avoid this because it is an uncessary synchronisation */
334 if (nprow != 1) {
335 gridreshape(&gcontext);
336 Cblacs_gridinfo(gcontext, &dummy, &dummy, &dummy, &mypnum);
337 }
338 Cblacs_gridinfo(ma->ctxt, &p0, &q0, &myprow0, &mypcol0);
339 /* compatibility T3D, must check myprow and mypcol are within bounds */
340 if (myprow0 >= p0 || mypcol0 >= q0)
341 myprow0 = mypcol0 = -1;
342 assert((myprow0 < p0 && mypcol0 < q0) || (myprow0 == -1 && mypcol0 == -1));
343 Cblacs_gridinfo(mb->ctxt, &p1, &q1, &myprow1, &mypcol1);
344 if (myprow1 >= p1 || mypcol1 >= q1)
345 myprow1 = mypcol1 = -1;
346 assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1));
347 /* exchange the missing parameters among the processors: shape of grids and
348 * location of the processors */
349 param = (int *) mr2d_malloc(3 * (nprocs * 2 + NBPARAM) * sizeof(int));
350 ra = param + nprocs * 2 + NBPARAM;
351 ca = param + (nprocs * 2 + NBPARAM) * 2;
352 for (i = 0; i < nprocs * 2 + NBPARAM; i++)
353 param[i] = MAGIC_MAX;
354 proc0 = param + NBPARAM;
355 proc1 = param + NBPARAM + nprocs;
356 /* we calulate proc0 and proc1 that will give the number of a proc in
357 * respectively a or b in the global context */
358 if (myprow0 >= 0) {
359 proc0[myprow0 * q0 + mypcol0] = mypnum;
360 param[0] = p0;
361 param[1] = q0;
362 param[4] = ma->m;
363 param[5] = ma->n;
364 param[6] = ma->nbrow;
365 param[7] = ma->nbcol;
366 param[8] = ma->sprow;
367 param[9] = ma->spcol;
368 param[10] = ia;
369 param[11] = ja;
370 }
371 if (myprow1 >= 0) {
372 proc1[myprow1 * q1 + mypcol1] = mypnum;
373 param[2] = p1;
374 param[3] = q1;
375 param[12] = mb->m;
376 param[13] = mb->n;
377 param[14] = mb->nbrow;
378 param[15] = mb->nbcol;
379 param[16] = mb->sprow;
380 param[17] = mb->spcol;
381 param[18] = ib;
382 param[19] = jb;
383 }
384 Cigamn2d(gcontext, "All", "H", 2 * nprocs + NBPARAM, 1, param, 2 * nprocs + NBPARAM,
385 ra, ca, 2 * nprocs + NBPARAM, -1, -1);
386 newa = *ma;
387 newb = *mb;
388 ma = &newa;
389 mb = &newb;
390 if (myprow0 == -1) {
391 p0 = param[0];
392 q0 = param[1];
393 ma->m = param[4];
394 ma->n = param[5];
395 ma->nbrow = param[6];
396 ma->nbcol = param[7];
397 ma->sprow = param[8];
398 ma->spcol = param[9];
399 ia = param[10];
400 ja = param[11];
401 }
402 if (myprow1 == -1) {
403 p1 = param[2];
404 q1 = param[3];
405 mb->m = param[12];
406 mb->n = param[13];
407 mb->nbrow = param[14];
408 mb->nbcol = param[15];
409 mb->sprow = param[16];
410 mb->spcol = param[17];
411 ib = param[18];
412 jb = param[19];
413 }
414 for (i = 0; i < NBPARAM; i++) {
415 if (param[i] == MAGIC_MAX) {
416 fprintf(stderr, "xxGEMR2D:something wrong in the parameters\n");
417 exit(1);
418 }
419 }
420 #ifndef NDEBUG
421 for (i = 0; i < p0 * q0; i++)
422 assert(proc0[i] >= 0 && proc0[i] < nprocs);
423 for (i = 0; i < p1 * q1; i++)
424 assert(proc1[i] >= 0 && proc1[i] < nprocs);
425 #endif
426 /* check the validity of the parameters */
427 paramcheck(ma, ia, ja, m, n, p0, q0, gcontext);
428 paramcheck(mb, ib, jb, m, n, p1, q1, gcontext);
429 /* we change the problem so that ia < a->nbrow ... andia + m = a->m ... */
430 {
431 int decal;
432 ia = changeorigin(myprow0, ma->sprow, p0,
433 ma->nbrow, ia, &decal, &ma->sprow);
434 ptrmyblock += decal;
435 ja = changeorigin(mypcol0, ma->spcol, q0,
436 ma->nbcol, ja, &decal, &ma->spcol);
437 ptrmyblock += decal * ma->lda;
438 ma->m = ia + m;
439 ma->n = ja + n;
440 ib = changeorigin(myprow1, mb->sprow, p1,
441 mb->nbrow, ib, &decal, &mb->sprow);
442 ptrmynewblock += decal;
443 jb = changeorigin(mypcol1, mb->spcol, q1,
444 mb->nbcol, jb, &decal, &mb->spcol);
445 ptrmynewblock += decal * mb->lda;
446 mb->m = ib + m;
447 mb->n = jb + n;
448 if (p0 == 1)
449 ma->nbrow = ma->m;
450 if (q0 == 1)
451 ma->nbcol = ma->n;
452 if (p1 == 1)
453 mb->nbrow = mb->m;
454 if (q1 == 1)
455 mb->nbcol = mb->n;
456 #ifndef NDEBUG
457 paramcheck(ma, ia, ja, m, n, p0, q0, gcontext);
458 paramcheck(mb, ib, jb, m, n, p1, q1, gcontext);
459 #endif
460 }
461 /* We compute the size of the memory buffer ( we choose the worst case,
462 * when the buffer sizes == the memory block sizes). */
463 if (myprow0 >= 0 && mypcol0 >= 0) {
464 /* Initialize pointer variables */
465 setmemory(&ptrsendbuff, memoryblocksize(ma));
466 }; /* if (mypnum < p0 * q0) */
467 if (myprow1 >= 0 && mypcol1 >= 0) {
468 /* Initialize pointer variables */
469 setmemory(&ptrrecvbuff, memoryblocksize(mb));
470 }; /* if (mypnum < p1 * q1) */
471 /* allocing room for the tabs, alloc for the worst case,local_n or local_m
472 * intervals, in fact the worst case should be less, perhaps half that,I
473 * should think of that one day. */
474 h_inter = (IDESC *) mr2d_malloc(DIVUP(ma->n, q0 * ma->nbcol) *
475 ma->nbcol * sizeof(IDESC));
476 v_inter = (IDESC *) mr2d_malloc(DIVUP(ma->m, p0 * ma->nbrow)
477 * ma->nbrow * sizeof(IDESC));
478 /* We go for the scanning of indices. For each processor including mypnum,
479 * we fill the sendbuff buffer (scanD0(SENDBUFF)) and when it is done send
480 * it. Then for each processor, we compute the size of message to be
481 * receive scanD0(SIZEBUFF)), post a receive and then allocate the elements
482 * of recvbuff the right place (scanD)(RECVBUFF)) */
483 recvptr = ptrrecvbuff;
484 {
485 int tot, myrang, step, sens;
486 int *sender, *recver;
487 int mesending, merecving;
488 tot = max(p0 * q0, p1 * q1);
489 init_chenille(mypnum, nprocs, p0 * q0, proc0, p1 * q1, proc1,
490 &sender, &recver, &myrang);
491 if (myrang == -1)
492 goto after_comm;
493 mesending = myprow0 >= 0;
494 assert(sender[myrang] >= 0 || !mesending);
495 assert(!mesending || proc0[sender[myrang]] == mypnum);
496 merecving = myprow1 >= 0;
497 assert(recver[myrang] >= 0 || !merecving);
498 assert(!merecving || proc1[recver[myrang]] == mypnum);
499 step = tot - 1 - myrang;
500 do {
501 for (sens = 0; sens < 2; sens++) {
502 /* be careful here, when we communicating with ourselves, we must
503 * send first (myrang > step == 0) */
504 if (mesending && recver[step] >= 0 &&
505 (sens == 0)) {
506 i = recver[step] / q1;
507 j = recver[step] % q1;
508 vinter_nb = scan_intervals('r', ia, ib, m, ma, mb, p0, p1, myprow0, i,
509 v_inter);
510 hinter_nb = scan_intervals('c', ja, jb, n, ma, mb, q0, q1, mypcol0, j,
511 h_inter);
512 sendsize = block2buff(v_inter, vinter_nb, h_inter, hinter_nb,
513 ptrmyblock, ma, ptrsendbuff);
514 } /* if (mesending...) { */
515 if (mesending && recver[step] >= 0 &&
516 (sens == myrang > step)) {
517 i = recver[step] / q1;
518 j = recver[step] % q1;
519 if (sendsize > 0
520 && (step != myrang || !merecving)
521 ) {
522 Csgesd2d(gcontext, sendsize, 1, ptrsendbuff, sendsize,
523 0, proc1[i * q1 + j]);
524 } /* sendsize > 0 */
525 } /* if (mesending ... */
526 if (merecving && sender[step] >= 0 &&
527 (sens == myrang <= step)) {
528 i = sender[step] / q0;
529 j = sender[step] % q0;
530 vinter_nb = scan_intervals('r', ib, ia, m, mb, ma, p1, p0, myprow1, i,
531 v_inter);
532 hinter_nb = scan_intervals('c', jb, ja, n, mb, ma, q1, q0, mypcol1, j,
533 h_inter);
534 recvsize = inter_len(hinter_nb, h_inter, vinter_nb, v_inter);
535 if (recvsize > 0) {
536 if (step == myrang && mesending) {
537 Clacpy(recvsize, 1,
538 ptrsendbuff, recvsize,
539 ptrrecvbuff, recvsize);
540 } else {
541 Csgerv2d(gcontext, recvsize, 1, ptrrecvbuff, recvsize,
542 0, proc0[i * q0 + j]);
543 }
544 } /* recvsize > 0 */
545 } /* if (merecving ...) */
546 if (merecving && sender[step] >= 0 && sens == 1) {
547 buff2block(v_inter, vinter_nb, h_inter, hinter_nb,
548 recvptr, ptrmynewblock, mb);
549 } /* if (merecving...) */
550 } /* for (sens = 0) */
551 step -= 1;
552 if (step < 0)
553 step = tot - 1;
554 } while (step != tot - 1 - myrang);
555 after_comm:
556 free(sender);
557 } /* { int tot,nr,ns ...} */
558 /* don't forget to clean up things! */
559 if (myprow1 >= 0 && mypcol1 >= 0) {
560 freememory((char *) ptrrecvbuff);
561 };
562 if (myprow0 >= 0 && mypcol0 >= 0) {
563 freememory((char *) ptrsendbuff);
564 };
565 if (nprow != 1)
566 Cblacs_gridexit(gcontext);
567 free(v_inter);
568 free(h_inter);
569 free(param);
570 }/* distrib */
571 static2 void
init_chenille(mypnum,nprocs,n0,proc0,n1,proc1,psend,precv,myrang)572 init_chenille(mypnum, nprocs, n0, proc0, n1, proc1, psend, precv, myrang)
573 int nprocs, mypnum, n0, n1;
574 int *proc0, *proc1, **psend, **precv, *myrang;
575 {
576 int ns, nr, i, tot;
577 int *sender, *recver, *g0, *g1;
578 tot = max(n0, n1);
579 sender = (int *) mr2d_malloc((nprocs + tot) * sizeof(int) * 2);
580 recver = sender + tot;
581 *psend = sender;
582 *precv = recver;
583 g0 = recver + tot;
584 g1 = g0 + nprocs;
585 for (i = 0; i < nprocs; i++) {
586 g0[i] = -1;
587 g1[i] = -1;
588 }
589 for (i = 0; i < tot; i++) {
590 sender[i] = -1;
591 recver[i] = -1;
592 }
593 for (i = 0; i < n0; i++)
594 g0[proc0[i]] = i;
595 for (i = 0; i < n1; i++)
596 g1[proc1[i]] = i;
597 ns = 0;
598 nr = 0;
599 *myrang = -1;
600 for (i = 0; i < nprocs; i++)
601 if (g0[i] >= 0 && g1[i] >= 0) {
602 if (i == mypnum)
603 *myrang = nr;
604 sender[ns] = g0[i];
605 ns += 1;
606 recver[nr] = g1[i];
607 nr += 1;
608 assert(ns <= n0 && nr <= n1 && nr == ns);
609 }
610 for (i = 0; i < nprocs; i++)
611 if (g0[i] >= 0 && g1[i] < 0) {
612 if (i == mypnum)
613 *myrang = ns;
614 sender[ns] = g0[i];
615 ns += 1;
616 assert(ns <= n0);
617 }
618 for (i = 0; i < nprocs; i++)
619 if (g1[i] >= 0 && g0[i] < 0) {
620 if (i == mypnum)
621 *myrang = nr;
622 recver[nr] = g1[i];
623 nr += 1;
624 assert(nr <= n1);
625 }
626 }
627 #define Mlacpy(mo,no,ao,ldao,bo,ldbo) \
628 { \
629 float *_a,*_b; \
630 int _m,_n,_lda,_ldb; \
631 int _i,_j; \
632 _m = (mo);_n = (no); \
633 _a = (ao);_b = (bo); \
634 _lda = (ldao) - _m; \
635 _ldb = (ldbo) - _m; \
636 assert(_lda >= 0 && _ldb >= 0); \
637 for (_j=0;_j<_n;_j++) { \
638 for (_i=0;_i<_m;_i++) \
639 *_b++ = *_a++; \
640 _b += _ldb; \
641 _a += _lda; \
642 } \
643 }
644 static2 int
block2buff(vi,vinb,hi,hinb,ptra,ma,buff)645 block2buff(vi, vinb, hi, hinb, ptra, ma, buff)
646 int hinb, vinb;
647 IDESC *hi, *vi;
648 MDESC *ma;
649 float *buff, *ptra;
650 {
651 int h, v, sizebuff;
652 float *ptr2;
653 sizebuff = 0;
654 for (h = 0; h < hinb; h++) {
655 ptr2 = ptra + hi[h].lstart * ma->lda;
656 for (v = 0; v < vinb; v++) {
657 Mlacpy(vi[v].len, hi[h].len,
658 ptr2 + vi[v].lstart,
659 ma->lda,
660 buff + sizebuff, vi[v].len);
661 sizebuff += hi[h].len * vi[v].len;
662 }
663 }
664 return sizebuff;
665 }
666 static2 void
buff2block(vi,vinb,hi,hinb,buff,ptrb,mb)667 buff2block(vi, vinb, hi, hinb, buff, ptrb, mb)
668 int hinb, vinb;
669 IDESC *hi, *vi;
670 MDESC *mb;
671 float *buff, *ptrb;
672 {
673 int h, v, sizebuff;
674 float *ptr2;
675 sizebuff = 0;
676 for (h = 0; h < hinb; h++) {
677 ptr2 = ptrb + hi[h].lstart * mb->lda;
678 for (v = 0; v < vinb; v++) {
679 Mlacpy(vi[v].len, hi[h].len,
680 buff + sizebuff, vi[v].len,
681 ptr2 + vi[v].lstart,
682 mb->lda);
683 sizebuff += hi[h].len * vi[v].len;
684 }
685 }
686 }
687 static2 int
inter_len(hinb,hi,vinb,vi)688 inter_len(hinb, hi, vinb, vi)
689 int hinb, vinb;
690 IDESC *hi, *vi;
691 {
692 int hlen, vlen, h, v;
693 hlen = 0;
694 for (h = 0; h < hinb; h++)
695 hlen += hi[h].len;
696 vlen = 0;
697 for (v = 0; v < vinb; v++)
698 vlen += vi[v].len;
699 return hlen * vlen;
700 }
701 void
Clacpy(m,n,a,lda,b,ldb)702 Clacpy(m, n, a, lda, b, ldb)
703 float *a, *b;
704 int m, n, lda, ldb;
705 {
706 int i, j;
707 lda -= m;
708 ldb -= m;
709 assert(lda >= 0 && ldb >= 0);
710 for (j = 0; j < n; j++) {
711 for (i = 0; i < m; i++)
712 *b++ = *a++;
713 b += ldb;
714 a += lda;
715 }
716 }
717 static2 void
gridreshape(ctxtp)718 gridreshape(ctxtp)
719 int *ctxtp;
720 {
721 int ori, final; /* original context, and new context created, with
722 * line form */
723 int nprow, npcol, myrow, mycol;
724 int *usermap;
725 int i, j;
726 ori = *ctxtp;
727 Cblacs_gridinfo(ori, &nprow, &npcol, &myrow, &mycol);
728 usermap = mr2d_malloc(sizeof(int) * nprow * npcol);
729 for (i = 0; i < nprow; i++)
730 for (j = 0; j < npcol; j++) {
731 usermap[i + j * nprow] = Cblacs_pnum(ori, i, j);
732 }
733 /* Cblacs_get(0, 0, &final); */
734 Cblacs_get(ori, 10, &final);
735 Cblacs_gridmap(&final, usermap, 1, 1, nprow * npcol);
736 *ctxtp = final;
737 free(usermap);
738 }
739