1 #include "redist.h"
2 /* $Id: pgemraux.c,v 1.1.1.1 2000/02/15 18:04:10 susan Exp $
3  *
4  * some functions used by the pigemr2d routine see file pigemr.c for more
5  * documentation.
6  *
7  * Created March 1993 by B. Tourancheau (See sccs for modifications). */
8 #define static2 static
9 #if defined(Add_) || defined(f77IsF2C)
10 #define fortran_mr2d pigemr2do_
11 #define fortran_mr2dnew pigemr2d_
12 #elif defined(UpCase)
13 #define fortran_mr2dnew PIGEMR2D
14 #define fortran_mr2d PIGEMR2DO
15 #define icopy_ ICOPY
16 #define ilacpy_ ILACPY
17 #else
18 #define fortran_mr2d pigemr2do
19 #define fortran_mr2dnew pigemr2d
20 #define icopy_ icopy
21 #define ilacpy_ ilacpy
22 #endif
23 #define Clacpy Cigelacpy
24 void  Clacpy();
25 typedef struct {
26   int   desctype;
27   int   ctxt;
28   int   m;
29   int   n;
30   int   nbrow;
31   int   nbcol;
32   int   sprow;
33   int   spcol;
34   int   lda;
35 }     MDESC;
36 #define BLOCK_CYCLIC_2D 1
37 typedef struct {
38   int   lstart;
39   int   len;
40 }     IDESC;
41 #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow)))
42 #define max(A,B) ((A)>(B)?(A):(B))
43 #define min(A,B) ((A)>(B)?(B):(A))
44 #define DIVUP(a,b) ( ((a)-1) /(b)+1)
45 #define ROUNDUP(a,b) (DIVUP(a,b)*(b))
46 #ifdef MALLOCDEBUG
47 #define malloc mymalloc
48 #define free myfree
49 #define realloc myrealloc
50 #endif
51 /* Cblacs */
52 extern void Cblacs_pcoord();
53 extern int Cblacs_pnum();
54 extern void Csetpvmtids();
55 extern void Cblacs_get();
56 extern void Cblacs_pinfo();
57 extern void Cblacs_gridinfo();
58 extern void Cblacs_gridinit();
59 extern void Cblacs_exit();
60 extern void Cblacs_gridexit();
61 extern void Cblacs_setup();
62 extern void Cigebs2d();
63 extern void Cigebr2d();
64 extern void Cigesd2d();
65 extern void Cigerv2d();
66 extern void Cigsum2d();
67 extern void Cigamn2d();
68 extern void Cigamx2d();
69 extern void Cigesd2d();
70 extern void Cigerv2d();
71 /* lapack */
72 void  ilacpy_();
73 /* aux fonctions */
74 extern int localindice();
75 extern void *mr2d_malloc();
76 extern int ppcm();
77 extern int localsize();
78 extern int memoryblocksize();
79 extern int changeorigin();
80 extern void paramcheck();
81 /* tools and others function */
82 #define scanD0 igescanD0
83 #define dispmat igedispmat
84 #define setmemory igesetmemory
85 #define freememory igefreememory
86 #define scan_intervals igescan_intervals
87 extern void scanD0();
88 extern void dispmat();
89 extern void setmemory();
90 extern void freememory();
91 extern int scan_intervals();
92 extern void Cpigemr2do();
93 extern void Cpigemr2d();
94 /* some defines for Cpigemr2do */
95 #define SENDBUFF 0
96 #define RECVBUFF 1
97 #define SIZEBUFF 2
98 #if 0
99 #define DEBUG
100 #endif
101 #ifndef DEBUG
102 #define NDEBUG
103 #endif
104 #include <stdio.h>
105 #include <stdlib.h>
106 #include <assert.h>
107 void *
mr2d_malloc(n)108 mr2d_malloc(n)
109   int   n;
110 {
111   void *ptr;
112   assert(n > 0);
113   ptr = (void *) malloc(n);
114   if (ptr == NULL) {
115     fprintf(stderr, "xxmr2d:out of memory\n");
116     exit(2);
117   }
118   return ptr;
119 }
120 int
pgcd(a,b)121 pgcd(a, b)
122   int   a, b;
123 {
124   int   aux;
125   if (a < b)
126     return pgcd(b, a);
127   else {
128     aux = a % b;
129     if (aux == 0)
130       return b;
131     else
132       return pgcd(b, aux);
133   }
134 }
135 int
ppcm(a,b)136 ppcm(a, b)
137   int   a, b;
138 {
139   int   pg;
140   pg = pgcd(a, b);
141   return a * (b / pg);
142 }
143 /* localsize:return the number of rows on the local processor given by its
144  * row number myprow, of a distributed matrix with m rows distributed of on a
145  * grid of processors with p rows with blocksize nbrow : this procedure can
146  * also be used to compute the number of cols by replacing rows by cols */
147 int
localsize(myprow,p,nbrow,m)148 localsize(myprow, p, nbrow, m)
149   int   myprow, p, nbrow, m;
150 {
151   int   templateheight, blockheight;
152   templateheight = p * nbrow;
153   if (m % templateheight != 0) {	/* not an exact boundary */
154     if ((m % templateheight) > (nbrow * myprow)) {	/* processor
155 							 * (myprow,mypcol) has
156 							 * some elements in that
157 							 * incomplete template */
158       if ((m % templateheight) >= (nbrow * (myprow + 1))) {	/* processor
159 								 * (myprow,mypcol)'s
160 								 * part is complete */
161 	blockheight = (m / templateheight) * nbrow + nbrow;
162       } else {	/* processor (myprow,mypcol)'s part is not complete */
163 	blockheight = (m / templateheight) * nbrow + (m % nbrow);
164       };	/* if ((m%templateheight) > (nbrow*(myprow+1))) */
165     } else {	/* processor (myprow,mypcol) has no element in that
166 		 * incomplete template */
167       blockheight = (m / templateheight) * nbrow;
168     };	/* if ((m%templateheight) > (nbrow*myprow)) */
169   } else {	/* exact boundary */
170     blockheight = m / p;	/* (m/templateheight) * nbrow */
171   };	/* if (m%templateheight !=0) */
172   return blockheight;
173 }
174 /****************************************************************/
175 /* Returns the exact memory block size corresponding to the parameters */
176 int
memoryblocksize(a)177 memoryblocksize(a)
178   MDESC *a;
179 {
180   int   myprow, mypcol, p, q;
181   /* Compute the (myprow,mypcol) indices of processor mypnum in P0xQ0 We
182    * assume the row-major ordering of the BLACS */
183   Cblacs_gridinfo(a->ctxt, &p, &q, &myprow, &mypcol);
184   myprow = SHIFT(myprow, a->sprow, p);
185   mypcol = SHIFT(mypcol, a->spcol, q);
186   assert(myprow >= 0 && mypcol >= 0);
187   return localsize(myprow, p, a->nbrow, a->m) *
188 	localsize(mypcol, q, a->nbcol, a->n);
189 }
190 void
checkequal(ctxt,a)191 checkequal(ctxt, a)
192   int   a, ctxt;
193 {
194   int   np, dummy, nbrow, myp, b;
195   Cblacs_gridinfo(ctxt, &nbrow, &np, &dummy, &myp);
196   assert(nbrow == 1);
197   if (np == 1)
198     return;
199   if (myp == 0) {
200     Cigesd2d(ctxt, 1, 1, &a, 1, 0, 1);
201     Cigerv2d(ctxt, 1, 1, &b, 1, 0, np - 1);
202     assert(a == b);
203   } else {
204     Cigerv2d(ctxt, 1, 1, &b, 1, 0, myp - 1);
205     assert(a == b);
206     Cigesd2d(ctxt, 1, 1, &a, 1, 0, (myp + 1) % np);
207   }
208 }
209 void
paramcheck(a,i,j,m,n,p,q,gcontext)210 paramcheck(a, i, j, m, n, p, q, gcontext)
211   MDESC *a;
212   int   i, j, m, n, p, q;
213 {
214   int   p2, q2, myprow, mypcol;
215 #ifndef NDEBUG
216   checkequal(gcontext, p);
217   checkequal(gcontext, q);
218   checkequal(gcontext, a->sprow);
219   checkequal(gcontext, a->spcol);
220   checkequal(gcontext, a->m);
221   checkequal(gcontext, a->n);
222   checkequal(gcontext, i);
223   checkequal(gcontext, j);
224   checkequal(gcontext, a->nbrow);
225   checkequal(gcontext, a->nbcol);
226 #endif
227   Cblacs_gridinfo(a->ctxt, &p2, &q2, &myprow, &mypcol);
228   /* compatibility T3D, must check myprow  and mypcol are within bounds */
229   if (myprow >= p2 || mypcol >= q2)
230     myprow = mypcol = -1;
231   if ((myprow >= 0 || mypcol >= 0) && (p2 != p && q2 != q)) {
232     fprintf(stderr, "??MR2D:incoherent p,q parameters\n");
233     exit(1);
234   }
235   assert(myprow < p && mypcol < q);
236   if (a->sprow < 0 || a->sprow >= p || a->spcol < 0 || a->spcol >= q) {
237     fprintf(stderr, "??MR2D:Bad first processor coordinates\n");
238     exit(1);
239   }
240   if (i < 0 || j < 0 || i + m > a->m || j + n > a->n) {
241     fprintf(stderr, "??MR2D:Bad submatrix:i=%d,j=%d,\
242 m=%d,n=%d,M=%d,N=%d\n",
243 	    i, j, m, n, a->m, a->n);
244     exit(1);
245   }
246   if ((myprow >= 0 || mypcol >= 0) &&
247       localsize(SHIFT(myprow, a->sprow, p), p, a->nbrow, a->m) > a->lda) {
248     fprintf(stderr, "??MR2D:bad lda arg:row=%d,m=%d,p=%d,\
249 nbrow=%d,lda=%d,sprow=%d\n",
250 	    myprow, a->m, p, a->nbrow, a->lda, a->sprow);
251     exit(1);
252   }
253 }
254 /* to change from the submatrix beginning at line i to one beginning at line
255  * i' with i'< blocksize return the line number on the local process where
256  * the new matrix begin, the new process number, and i' */
257 int
changeorigin(myp,sp,p,bs,i,decal,newsp)258 changeorigin(myp, sp, p, bs, i, decal, newsp)
259   int   myp, sp, p, bs, i;
260   int  *decal, *newsp;
261 {
262   int   tempheight, firstblock, firsttemp;
263   /* we begin by changing the parameters so that ia < templatewidth,... */
264   tempheight = bs * p;
265   firsttemp = i / tempheight;
266   firstblock = (i / bs) % p;
267   *newsp = (sp + firstblock) % p;
268   if (myp >= 0)
269     *decal = firsttemp * bs + (SHIFT(myp, sp, p) < firstblock ? bs : 0);
270   else
271     *decal = 0;
272   return i % bs;
273 }
274 /******************************************************************/
275 /* Return the indice in local memory of element of indice a in the matrix */
276 int
localindice(ig,jg,templateheight,templatewidth,a)277 localindice(ig, jg, templateheight, templatewidth, a)
278   int   templateheight, templatewidth, ig, jg;
279   MDESC *a;
280 /* Return the indice in local memory (scattered distribution) of the element
281  * of indice a in global matrix */
282 {
283   int   vtemp, htemp, vsubtemp, hsubtemp, il, jl;
284   assert(ig >= 0 && ig < a->m && jg >= 0 && jg < a->n);
285   /* coordinates in global matrix with the tests in intersect, ig MUST BE in
286    * [0..m] and jg in [0..n] */
287   /* coordinates of the template that "owns" the element */
288   vtemp = ig / templateheight;
289   htemp = jg / templatewidth;
290   /* coordinates of the element in the subblock of the (vtemp, htemp)
291    * template */
292   vsubtemp = ig % a->nbrow;
293   hsubtemp = jg % a->nbcol;
294   /* coordinates of the element in the local block of the processor */
295   il = a->nbrow * vtemp + vsubtemp;
296   jl = a->nbcol * htemp + hsubtemp;
297   assert(il < a->lda);
298 #ifndef NDEBUG
299   {
300     int   pr, pc, p, q, lp, lq;
301     Cblacs_gridinfo(a->ctxt, &p, &q, &pr, &pc);
302     p = templateheight / a->nbrow;
303     q = templatewidth / a->nbcol;
304     lp = ig % templateheight / a->nbrow;
305     lq = jg % templatewidth / a->nbcol;
306     assert(lp == SHIFT(pr, a->sprow, p));
307     assert(lq == SHIFT(pc, a->spcol, q));
308   }
309 #endif
310   return (jl * a->lda + il);
311 }
312