1
2 #include <stdio.h>
3
4 #include "macdecls.h"
5 #include "ga.h"
6 #include "gp.h"
7 #define USE_HYPRE 0
8 #define IMAX 100
9 #define JMAX 100
10 #define KMAX 100
11 #define LMAX IMAX*JMAX*KMAX
12 #define MAXVEC 5000000
13 #define EPSLN 1.0d-10
14 #define CHECK_BOUND 1
15
16 #define bb_a(ib) bb_v(bb_i + (ib))
17 #define cc_a(ib) cc_v(cc_i + (ib))
18
19 #define Integer int
20 #define MAX_FACTOR 10000
21
22 #if USE_HYPRE
23 #include "HYPRE.h"
24 #include "HYPRE_struct_mv.h"
25 #include "mpi.h"
26 #endif
27
grid_factor(int p,int xdim,int ydim,int zdim,int * idx,int * idy,int * idz)28 void grid_factor(int p, int xdim, int ydim, int zdim, int *idx, int *idy, int *idz) {
29 int i, j;
30 int ip, ifac, pmax, prime[MAX_FACTOR];
31 int fac[MAX_FACTOR];
32 int ix, iy, iz, ichk;
33
34 i = 1;
35 /*
36 factor p completely
37 first, find all prime numbers, besides 1, less than or equal to
38 the square root of p
39 */
40 ip = (int)(sqrt((double)p))+1;
41 pmax = 0;
42 for (i=2; i<=ip; i++) {
43 ichk = 1;
44 for (j=0; j<pmax; j++) {
45 if (i%prime[j] == 0) {
46 ichk = 0;
47 break;
48 }
49 }
50 if (ichk) {
51 pmax = pmax + 1;
52 if (pmax > MAX_FACTOR) printf("Overflow in grid_factor\n");
53 prime[pmax-1] = i;
54 }
55 }
56 /*
57 find all prime factors of p
58 */
59 ip = p;
60 ifac = 0;
61 for (i=0; i<pmax; i++) {
62 while(ip%prime[i] == 0) {
63 ifac = ifac + 1;
64 fac[ifac-1] = prime[i];
65 ip = ip/prime[i];
66 }
67 }
68 /*
69 p is prime
70 */
71 if (ifac==0) {
72 ifac++;
73 fac[0] = p;
74 }
75
76 /*
77 determine three factors of p of approximately the
78 same size
79 */
80 *idx = 1;
81 *idy = 1;
82 *idz = 1;
83 for (i = ifac-1; i >= 0; i--) {
84 ix = xdim/(*idx);
85 iy = ydim/(*idy);
86 iz = zdim/(*idz);
87 if (ix >= iy && ix >= iz && ix > 1) {
88 *idx = fac[i]*(*idx);
89 } else if (iy >= ix && iy >= iz && iy > 1) {
90 *idy = fac[i]*(*idy);
91 } else if (iz >= ix && iz >= iy && iz > 1) {
92 *idz = fac[i]*(*idz);
93 } else {
94 printf("Too many processors in grid factoring routine\n");
95 }
96 }
97 }
98
99 /*
100 Short subroutine for multiplying sparse matrix block with vector segment
101 */
loc_matmul(double * a_mat,int * jvec,int * ivec,double * bvec,double * cvec,int nrows)102 void loc_matmul(double *a_mat, int *jvec, int *ivec,
103 double *bvec, double *cvec, int nrows) {
104 double tempc;
105 int i, j, jj, jmin,jmax;
106 for (i=0; i<nrows; i++) {
107 jmin = ivec[i];
108 jmax = ivec[i+1]-1;
109 tempc = 0.0;
110 for (j=jmin; j<jmax; j++) {
111 jj = jvec[j];
112 tempc = tempc + a_mat[j]*bvec[jj];
113 }
114 cvec[i] = cvec[i] + tempc;
115 }
116 }
117 /*
118 Random number generator
119 */
ran3(int * idum)120 double ran3(int *idum) {
121 static int iff = 0;
122 if (*idum < 0 || iff == 0) {
123 iff = 1;
124 srand(abs(*idum));
125 *idum = 1;
126 }
127 return ((double)rand())/((double)RAND_MAX);
128 }
129
130 /*
131 * A serial routine for transposing a matrix stored in a standard Compressed
132 * Sparse Row (CSR) format. The routine is based on compiling a linked list of
133 * the elements in each row and then using that to perform the transpose. The
134 * ordering of the matrix elements in the transposed matrix is not monotonic,
135 * that would require an additional sort on each row of the transposed matrix.
136 */
stran(int nrows,int ncols,int * i_in,int * j_in,int * val_in,int * i_out,int * j_out,int * val_out)137 void stran(int nrows, int ncols, int *i_in, int* j_in, int *val_in,
138 int *i_out, int *j_out, int *val_out)
139 {
140 int i, j, jj, jcnt;
141 int nnz, jmin, jmax;
142 int *header, *link, *jdx;
143
144 /* Create link list for all column entries */
145 nnz = i_in[nrows];
146 header = (int*)malloc(ncols*sizeof(int));
147 link = (int*)malloc(nnz*sizeof(int));
148 jdx = (int*)malloc(nnz*sizeof(int));
149 for (i=0; i<ncols; i++) header[i] = -1;
150 for (i=0; i<nnz; i++) link[i] = -1;
151 jcnt = 0;
152 for (i=0; i<nrows; i++) {
153 jmin = i_in[i];
154 jmax = i_in[i+1];
155 for (j=jmin; j<jmax; j++) {
156 jj = j_in[j];
157 link[jcnt] = header[jj];
158 jdx[jcnt] = i;
159 header[jj] = jcnt;
160 jcnt++;
161 }
162 }
163
164 /* Use link-list to evaluate transpose */
165 jj = 0;
166 for (i=0; i<ncols; i++) {
167 i_out[i] = jj;
168 j = header[i];
169 while (j != -1) {
170 j_out[jj] = jdx[j];
171 val_out[jj] = val_in[j];
172 j = link[j];
173 jj++;
174 }
175 }
176 i_out[ncols] = nnz;
177 free(jdx);
178 free(link);
179 free(header);
180 }
181
182 /* serial transpose code for real matrices */
stranr(int nrows,int ncols,int * i_in,int * j_in,double * val_in,int * i_out,int * j_out,double * val_out)183 void stranr(int nrows, int ncols, int *i_in, int* j_in, double *val_in,
184 int *i_out, int *j_out, double *val_out)
185 {
186 int i, j, jj, jcnt;
187 int nnz, jmin, jmax;
188 int *header, *link, *jdx;
189
190 /* Create link list for all column entries */
191 nnz = i_in[nrows];
192 header = (int*)malloc(ncols*sizeof(int));
193 link = (int*)malloc(nnz*sizeof(int));
194 jdx = (int*)malloc(nnz*sizeof(int));
195 for (i=0; i<ncols; i++) header[i] = -1;
196 for (i=0; i<nnz; i++) link[i] = -1;
197 jcnt = 0;
198 for (i=0; i<nrows; i++) {
199 jmin = i_in[i];
200 jmax = i_in[i+1];
201 for (j=jmin; j<jmax; j++) {
202 jj = j_in[j];
203 link[jcnt] = header[jj];
204 jdx[jcnt] = i;
205 header[jj] = jcnt;
206 jcnt++;
207 }
208 }
209
210 /* Use link-list to evaluate transpose */
211 jj = 0;
212 for (i=0; i<ncols; i++) {
213 i_out[i] = jj;
214 j = header[i];
215 while (j != -1) {
216 j_out[jj] = jdx[j];
217 val_out[jj] = val_in[j];
218 j = link[j];
219 jj++;
220 }
221 }
222 i_out[ncols] = nnz;
223 free(jdx);
224 free(link);
225 free(header);
226 }
227
228 /*
229 create a random sparse matrix in compressed row form corresponding to a
230 7-point stencil for a grid on a lattice of dimension idim X jdim X kdim grid
231 points
232 */
create_laplace_mat(int idim,int jdim,int kdim,int pdi,int pdj,int pdk,int * gp_block,int * g_j,int * g_i,int ** imapc)233 void create_laplace_mat(int idim, int jdim, int kdim, int pdi, int pdj, int pdk,
234 int *gp_block, int *g_j, int *g_i, int **imapc) {
235 /*
236 idim: i-dimension of grid
237 jdim: j-dimension of grid
238 kdim: k-dimension of grid
239 pdi: i-dimension of processor grid
240 pdj: j-dimension of processor grid
241 pdk: k-dimension of processor grid
242 ! g_data: global array of values
243 ! g_j: global array containing j indices (using local indices)
244 ! g_i: global array containing starting location of each row in g_j
245 ! (using local indices)
246 gp_block: global pointer array containing non-zero sparse sub-blocks of
247 matrix
248 g_j: global array containing j indices of sub-blocks
249 g_i: global array containing starting location of each row in g_j
250 tsize: total number of non-zero elements in matrix
251 imapc: map array for vectors
252 */
253 int ltotal_procs;
254 int *lproclist, *lproc_inv, *lvoffset, *lnsize, *loffset, *licnt, *limapc;
255 int *nnz_list;
256 int nnz, offset, b_nnz;
257 int nprocs, me, imin, imax, jcnt;
258 int *jmin, *jmax;
259 int ix, iy, iz, idx;
260 double x, dr;
261 double *rval, *gp_rval;
262 int isize, idbg;
263 int *jval, *gp_jval, *ival, *gp_ival, *ivalt;
264 int i, j, k, itmp, one, tlo, thi, ld;
265 int idum, ntot, indx, nghbrs[7], ncnt, nsave;
266 int ixn[7],iyn[7],izn[7], procid[7];
267 int status;
268 int lo[3], hi[3], ip, jp, kp, ldi, ldj, jdx, joff;
269 int il, jl, kl, ldmi, ldpi, ldmj, ldpj;
270 int *xld, *yld, *zld, *tmapc;
271 int *ecnt, *total_distr;
272 int total_max, toffset;
273 int *iparams, *blk_ptr;
274 int *iparamst, *jvalt;
275 double *rvalt;
276 FILE *fp, *fopen();
277
278 me = NGA_Nodeid();
279 nprocs = NGA_Nnodes();
280 idum = -(12345+me);
281 x = ran3(&idum);
282 one = 1;
283
284 if (me == 0) {
285 printf("\n Dimension of grid: \n\n");
286 printf(" I Dimension: %d\n",idim);
287 printf(" J Dimension: %d\n",jdim);
288 printf(" K Dimension: %d\n\n",kdim);
289 }
290 /*
291 Find position of processor in processor grid and calulate minimum
292 and maximum values of indices
293 */
294 i = me;
295 ip = i%pdi;
296 i = (i-ip)/pdi;
297 jp = i%pdj;
298 kp = (i-jp)/pdj;
299
300 lo[0] = (int)((((double)idim)*((double)ip))/((double)pdi));
301 if (ip < pdi-1) {
302 hi[0] = (int)((((double)idim)*((double)(ip+1)))/((double)pdi))-1;
303 } else {
304 hi[0] = idim - 1;
305 }
306
307 lo[1] = (int)((((double)jdim)*((double)jp))/((double)pdj));
308 if (jp < pdj-1) {
309 hi[1] = (int)((((double)jdim)*((double)(jp+1)))/((double)pdj))-1;
310 } else {
311 hi[1] = jdim - 1;
312 }
313
314 lo[2] = (int)((((double)kdim)*((double)kp))/((double)pdk));
315 if (kp < pdk-1) {
316 hi[2] = (int)((((double)kdim)*((double)(kp+1)))/((double)pdk))-1;
317 } else {
318 hi[2] = kdim - 1;
319 }
320
321 ldi = hi[0]-lo[0]+1;
322 ldj = hi[1]-lo[1]+1;
323
324 /* Evaluate xld, yld, zld. These contain the number of elements in each
325 division along the x, y, z axes */
326 xld = (int*)malloc(pdi*sizeof(int));
327 for (i=0; i<pdi; i++) {
328 if (i<pdi-1) {
329 xld[i] = (int)((((double)idim)*((double)(i+1)))/((double)pdi));
330 } else {
331 xld[i] = idim;
332 }
333 xld[i] = xld[i] - (int)((((double)idim)*((double)(i)))/((double)pdi));
334 }
335
336 yld = (int*)malloc(pdj*sizeof(int));
337 for (i=0; i<pdj; i++) {
338 if (i<pdj-1) {
339 yld[i] = (int)((((double)jdim)*((double)(i+1)))/((double)pdj));
340 } else {
341 yld[i] = jdim;
342 }
343 yld[i] = yld[i] - (int)((((double)jdim)*((double)(i)))/((double)pdj));
344 }
345
346 zld = (int*)malloc(pdk*sizeof(int));
347 for (i=0; i<pdk; i++) {
348 if (i<pdk-1) {
349 zld[i] = (int)((((double)kdim)*((double)(i+1)))/((double)pdk));
350 } else {
351 zld[i] = jdim;
352 }
353 zld[i] = zld[i] - (int)((((double)kdim)*((double)(i)))/((double)pdk));
354 }
355
356 /* Determine number of rows per processor
357 lnsize[i]: number of rows associated with process i
358 loffset[i]: global offset to location of first row associated
359 with process i */
360
361 lnsize = (int*)malloc(nprocs*sizeof(int));
362 loffset = (int*)malloc(nprocs*sizeof(int));
363 for (i=0; i<nprocs; i++) {
364 lnsize[i] = 0;
365 loffset[i] = 0;
366 }
367 lnsize[me] = (hi[0]-lo[0]+1)*(hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
368 NGA_Igop(lnsize,nprocs,"+");
369 loffset[0] = 0;
370 for (i=1; i<nprocs; i++) {
371 loffset[i] = loffset[i-1] + lnsize[i-1];
372 }
373
374 ntot = idim*jdim*kdim;
375 NGA_Sync();
376 /*
377 scan over rows of lattice
378 imin: minimum global index of rows associated with this process (me)
379 imax: maximum global index of rows associated with this process (me)
380 */
381 imin = loffset[me];
382 imax = loffset[me]+lnsize[me]-1;
383 free(loffset);
384 /*
385 find out how many other processors couple to this row of blocks
386 ecnt[i]: the number of columns on processor i that are coupled to this
387 process
388 */
389 ecnt = (int*)malloc(nprocs*sizeof(int));
390 for (i=0; i<nprocs; i++) {
391 ecnt[i] = 0;
392 }
393
394 for (i=imin; i<=imax; i++) {
395 /*
396 compute local indices of grid point corresponding to row i
397 */
398 indx = i - imin;
399 ix = indx%ldi;
400 indx = (indx - ix)/ldi;
401 iy = indx%ldj;
402 iz = (indx - iy)/ldj;
403 ix = ix + lo[0];
404 iy = iy + lo[1];
405 iz = iz + lo[2];
406
407 ecnt[me] = ecnt[me] + 1;
408 if (ix+1 <= idim-1) {
409 if (ix+1 > hi[0]) {
410 jdx = kp*pdi*pdj + jp*pdi + ip + 1;
411 ecnt[jdx] = ecnt[jdx] + 1;
412 } else {
413 ecnt[me] = ecnt[me] + 1;
414 }
415 }
416 if (ix-1 >= 0) {
417 if (ix-1 < lo[0]) {
418 jdx = kp*pdi*pdj + jp*pdi + ip - 1;
419 ecnt[jdx] = ecnt[jdx] + 1;
420 } else {
421 ecnt[me] = ecnt[me] + 1;
422 }
423 }
424 if (iy+1 <= jdim-1) {
425 if (iy+1 > hi[1]) {
426 jdx = kp*pdi*pdj + (jp+1)*pdi + ip;
427 ecnt[jdx] = ecnt[jdx] + 1;
428 } else {
429 ecnt[me] = ecnt[me] + 1;
430 }
431 }
432 if (iy-1 >= 0) {
433 if (iy-1 < lo[1]) {
434 jdx = kp*pdi*pdj + (jp-1)*pdi + ip;
435 ecnt[jdx] = ecnt[jdx] + 1;
436 } else {
437 ecnt[me] = ecnt[me] + 1;
438 }
439 }
440 if (iz+1 <= kdim-1) {
441 if (iz+1 > hi[2]) {
442 jdx = (kp+1)*pdi*pdj + jp*pdi + ip;
443 ecnt[jdx] = ecnt[jdx] + 1;
444 } else {
445 ecnt[me] = ecnt[me] + 1;
446 }
447 }
448 if (iz-1 >= 0) {
449 if (iz-1 < lo[2]) {
450 jdx = (kp-1)*pdi*pdj + jp*pdi + ip;
451 ecnt[jdx] = ecnt[jdx] + 1;
452 } else {
453 ecnt[me] = ecnt[me] + 1;
454 }
455 }
456 }
457
458 /* Create list of processors that this processor is coupled to.
459 If ecnt[i] is greater than zero then process i is coupled to this process.
460 ltotal_procs: the total number of other processor that this process is coupled
461 to. This includes this process (the diagonal term).
462 lproclist[i]: the IDs of the processor that this processor is coupled to
463 lproc_inv[i]: the location in lproclist of processor i. If processor i is not
464 coupled to this process, the lproc_inv[i] = -1
465 ncnt: total number of non-zero elements held by this process
466 nnz_list[i]: number of processes coupled to process i by sparse blocks
467 nnz: total number of sparse blocks */
468
469 ltotal_procs = 0;
470 ncnt = 0;
471 for (i=0; i<nprocs; i++) {
472 if (ecnt[i] > 0) {
473 ltotal_procs++;
474 ncnt += ecnt[i];
475 }
476 }
477 nsave = ncnt;
478
479 lproclist = (int*)malloc(ltotal_procs*sizeof(int));
480 lproc_inv = (int*)malloc(nprocs*sizeof(int));
481 licnt = (int*)malloc(ltotal_procs*sizeof(int));
482 for (i=0; i<ltotal_procs; i++) {
483 licnt[i] = 0;
484 }
485
486 rval = (double*)malloc(ncnt*sizeof(double));
487 idbg = ncnt;
488 jval = (int*)malloc(ncnt*sizeof(int));
489 ival = (int*)malloc((imax-imin+2)*ltotal_procs*sizeof(int));
490 ivalt = (int*)malloc((imax-imin+2)*ltotal_procs*sizeof(int));
491
492 for (i=0; i<ncnt; i++) {
493 rval[i] = 0.0;
494 jval[i] = 0;
495 }
496
497 j = (imax-imin+2)*ltotal_procs;
498 for (i=0; i<j; i++) {
499 ival[i] = 0;
500 ivalt[i] = 0;
501 }
502
503 nnz_list = (int*)malloc(nprocs*sizeof(int));
504 for (i=0; i<nprocs; i++) {
505 nnz_list[i] = 0;
506 }
507
508 /* nnz is total number of non-zero sparse blocks */
509 nnz_list[me] = ltotal_procs;
510 NGA_Igop(nnz_list, nprocs, "+");
511 nnz = 0;
512 for (i=0; i<nprocs; i++) {
513 nnz += nnz_list[i];
514 }
515
516 /* lvoffset[i]: local offset into array ival[i] to get to elements associated
517 with block i (i runs from 0 to ltotal_procs-1)
518 isize: number of rows (plus 1) that reside on this processor */
519 isize = (imax-imin+2);
520 for (i=0; i<nprocs; i++) {
521 lproc_inv[i] = -1;
522 }
523 lvoffset = (int*)malloc(ltotal_procs*sizeof(int));
524 lvoffset[0] = 0;
525 j = 0;
526 for (i=0; i<nprocs; i++) {
527 if (ecnt[i] > 0) {
528 lproclist[j] = i;
529 if (j > 0) {
530 lvoffset[j] = ecnt[lproclist[j-1]]+lvoffset[j-1];
531 }
532 lproc_inv[i] = j;
533 j++;
534 }
535 }
536
537 /* Create arrays the hold the sparse block representation of the sparse matrix
538 gp_block[nnz]: Global Pointer array holding the sparse sub-matrices
539 g_j[nnz]: column block indices for the element in gp_block
540 g_i[nprocs]: row block indices for the elements in g_j */
541
542 tmapc = (int*)malloc((nprocs+1)*sizeof(int));
543 tmapc[0] = 0;
544 for (i=1; i<=nprocs; i++) {
545 tmapc[i] = tmapc[i-1]+nnz_list[i-1];
546 }
547 *gp_block = GP_Create_handle();
548 GP_Set_dimensions(*gp_block,one,&nnz);
549 GP_Set_irreg_distr(*gp_block, tmapc, &nprocs);
550 GP_Allocate(*gp_block);
551
552 *g_j = NGA_Create_handle();
553 NGA_Set_data(*g_j,one,&nnz,C_INT);
554 NGA_Set_irreg_distr(*g_j, tmapc, &nprocs);
555 NGA_Allocate(*g_j);
556
557 for (i=0; i<nprocs; i++) {
558 tmapc[i] = i;
559 }
560 *g_i = NGA_Create_handle();
561 i = nprocs+1;
562 NGA_Set_data(*g_i,one,&i,C_INT);
563 NGA_Set_irreg_distr(*g_i, tmapc, &nprocs);
564 NGA_Allocate(*g_i);
565 free(tmapc);
566
567 jmin = (int*)malloc(nprocs*sizeof(int));
568 jmax = (int*)malloc(nprocs*sizeof(int));
569 for (i=0; i<nprocs; i++) {
570 jmin[i] = 0;
571 jmax[i] = 0;
572 }
573 jmin[me] = imin;
574 jmax[me] = imax;
575 NGA_Igop(jmin, nprocs, "+");
576 NGA_Igop(jmax, nprocs, "+");
577
578 /*
579 Create the sparse blocks holding actual data. All the elements within each block
580 couple this processor to one other processor
581 rval[i]: values of matrix elements
582 jval[i]: column indices of matrix elements
583 ival[i]: index of first elements in rval and jval for the row represented by
584 the index i.
585 ivalt[i]: temporary array used in the construction of ival[i]
586 */
587 for (i=imin; i<=imax; i++) {
588 /*
589 compute local indices of grid point corresponding to row i
590 */
591 indx = i - imin;
592 ix = indx%ldi;
593 indx = (indx - ix)/ldi;
594 iy = indx%ldj;
595 iz = (indx - iy)/ldj;
596 ix = ix + lo[0];
597 iy = iy + lo[1];
598 iz = iz + lo[2];
599 /*
600 find locations of neighbors in 7-point stencil (if they are on the grid)
601 */
602 ncnt = 0;
603 ixn[ncnt] = ix;
604 iyn[ncnt] = iy;
605 izn[ncnt] = iz;
606 il = ix - lo[0];
607 jl = iy - lo[1];
608 kl = iz - lo[2];
609 idx = kl*ldi*ldj + jl*ldi + il;
610 nghbrs[ncnt] = idx;
611 procid[ncnt] = me;
612 if (ix+1 <= idim - 1) {
613 ncnt++;
614 ixn[ncnt] = ix + 1;
615 iyn[ncnt] = iy;
616 izn[ncnt] = iz;
617 if (ix+1 > hi[0]) {
618 jdx = kp*pdi*pdj + jp*pdi + ip + 1;
619 il = 0;
620 jl = iy - lo[1];
621 kl = iz - lo[2];
622 ldpi = xld[ip+1];
623 } else {
624 jdx = me;
625 il = ix - lo[0] + 1;
626 jl = iy - lo[1];
627 kl = iz - lo[2];
628 ldpi = ldi;
629 }
630 idx = kl*ldpi*ldj + jl*ldpi + il;
631 nghbrs[ncnt] = idx;
632 procid[ncnt] = jdx;
633 }
634 if (ix-1 >= 0) {
635 ncnt++;
636 ixn[ncnt] = ix - 1;
637 iyn[ncnt] = iy;
638 izn[ncnt] = iz;
639 if (ix-1 < lo[0]) {
640 jdx = kp*pdi*pdj + jp*pdi + ip - 1;
641 il = xld[ip-1] - 1;
642 jl = iy - lo[1];
643 kl = iz - lo[2];
644 ldmi = xld[ip-1];
645 } else {
646 jdx = me;
647 il = ix - lo[0] - 1;
648 jl = iy - lo[1];
649 kl = iz - lo[2];
650 ldmi = ldi;
651 }
652 idx = kl*ldmi*ldj + jl*ldmi + il;
653 nghbrs[ncnt] = idx;
654 procid[ncnt] = jdx;
655 }
656 if (iy+1 <= jdim-1) {
657 ncnt++;
658 ixn[ncnt] = ix;
659 iyn[ncnt] = iy + 1;
660 izn[ncnt] = iz;
661 if (iy+1 > hi[1]) {
662 jdx = kp*pdi*pdj + (jp+1)*pdi + ip;
663 il = ix - lo[0];
664 jl = 0;
665 kl = iz - lo[2];
666 ldpj = yld[jp+1];
667 } else {
668 jdx = me;
669 il = ix - lo[0];
670 jl = iy - lo[1] + 1;
671 kl = iz - lo[2];
672 ldpj = ldj;
673 }
674 idx = kl*ldi*ldpj + jl*ldi + il;
675 nghbrs[ncnt] = idx;
676 procid[ncnt] = jdx;
677 }
678 if (iy-1 >= 0) {
679 ncnt++;
680 ixn[ncnt] = ix;
681 iyn[ncnt] = iy - 1;
682 izn[ncnt] = iz;
683 if (iy-1 < lo[1]) {
684 jdx = kp*pdi*pdj + (jp-1)*pdi + ip;
685 il = ix - lo[0];
686 jl = yld[jp-1] - 1;
687 kl = iz - lo[2];
688 ldmj = yld[jp-1];
689 } else {
690 jdx = me;
691 il = ix - lo[0];
692 jl = iy - lo[1] - 1;
693 kl = iz - lo[2];
694 ldmj = ldj;
695 }
696 idx = kl*ldi*ldmj + jl*ldi + il;
697 nghbrs[ncnt] = idx;
698 procid[ncnt] = jdx;
699 }
700 if (iz+1 <= kdim-1) {
701 ncnt++;
702 ixn[ncnt] = ix;
703 iyn[ncnt] = iy;
704 izn[ncnt] = iz + 1;
705 if (iz+1 > hi[2]) {
706 jdx = (kp+1)*pdi*pdj + jp*pdi + ip;
707 il = ix - lo[0];
708 jl = iy - lo[1];
709 kl = 0;
710 } else {
711 jdx = me;
712 il = ix - lo[0];
713 jl = iy - lo[1];
714 kl = iz - lo[2] + 1;
715 }
716 idx = kl*ldi*ldj + jl*ldi + il;
717 nghbrs[ncnt] = idx;
718 procid[ncnt] = jdx;
719 }
720 if (iz-1 >= 0) {
721 ncnt++;
722 ixn[ncnt] = ix;
723 iyn[ncnt] = iy;
724 izn[ncnt] = iz - 1;
725 if (iz-1 < lo[2]) {
726 jdx = (kp-1)*pdi*pdj + jp*pdi + ip;
727 il = ix - lo[0];
728 jl = iy - lo[1];
729 kl = zld[kp-1] - 1;
730 } else {
731 jdx = me;
732 il = ix - lo[0];
733 jl = iy - lo[1];
734 kl = iz - lo[2] - 1;
735 }
736 idx = kl*ldi*ldj + jl*ldi + il;
737 nghbrs[ncnt] = idx;
738 procid[ncnt] = jdx;
739 }
740 /*
741 sort indices so that neighbors run from lowest to highest local index. This sort
742 is not particularly efficient but ncnt is generally small
743 */
744 ncnt++;
745 for (j=0; j<ncnt; j++) {
746 for (k=j+1; k<ncnt; k++) {
747 if (nghbrs[j] > nghbrs[k]) {
748 itmp = nghbrs[j];
749 nghbrs[j] = nghbrs[k];
750 nghbrs[k] = itmp;
751 itmp = ixn[j];
752 ixn[j] = ixn[k];
753 ixn[k] = itmp;
754 itmp = iyn[j];
755 iyn[j] = iyn[k];
756 iyn[k] = itmp;
757 itmp = izn[j];
758 izn[j] = izn[k];
759 izn[k] = itmp;
760 itmp = procid[j];
761 procid[j] = procid[k];
762 procid[k] = itmp;
763 }
764 }
765 }
766 for (k=0; k<ncnt; k++) {
767 if (nghbrs[k] < 0 || nghbrs[k] >= ntot) {
768 printf("p[%d] Invalid neighbor %d\n",me,nghbrs[k]);
769 }
770 }
771
772 /* set weights corresponding to a finite difference Laplacian on a 7-point
773 stencil */
774
775 for (j=0; j<ncnt; j++) {
776 jdx = procid[j];
777 idx = lproc_inv[jdx];
778 if (ix == ixn[j] && iy == iyn[j] && iz == izn[j]) {
779 rval[lvoffset[idx]+licnt[idx]] = 6.0;
780 } else {
781 rval[lvoffset[idx]+licnt[idx]] = -1.0;
782 }
783 if (lvoffset[idx]+licnt[idx] < 0 || lvoffset[idx]+licnt[idx] >= nsave) {
784 printf("p[%d] Out of bounds (lvoffset+licnt)[%d]: %d\n",me,idx,lvoffset[idx]+licnt[idx]);
785 }
786 if (lvoffset[idx]+licnt[idx]>=idbg) {
787 }
788 /* TODO: Check this carefully */
789 jval[lvoffset[idx]+licnt[idx]] = nghbrs[j];
790 ivalt[idx*isize+i-imin] = ivalt[idx*isize+i-imin]+1;
791 licnt[idx]++;
792 }
793 }
794
795 /* finish evaluating ival array */
796
797 for (i=0; i<ltotal_procs; i++) {
798 ival[i*isize] = lvoffset[i];
799 for (j=1; j<isize; j++) {
800 ival[i*isize+j] = ival[i*isize+j-1] + ivalt[i*isize+j-1];
801 }
802 }
803 isize = 0;
804 for (i=0; i<ltotal_procs; i++) {
805 isize = isize + licnt[i];
806 }
807 if (isize > MAXVEC)
808 NGA_Error("ISIZE exceeds MAXVEC in local arrays ",isize);
809
810 /* Local portion of sparse matrix has been evaluated and decomposed into blocks
811 that match partitioning of right hand side across processors. The following
812 data is available at this point:
813 1) ltotal_procs: the number of processors that are coupled to this one via
814 the sparse matrix
815 2) lproclist(ltotal_procs): a list of processor IDs that are coupled to
816 this processor
817 3) lproc_inv(nprocs): The entry in proc_list that corresponds to a given
818 processor. If the entry is -1 then that processor does not couple to
819 this processor.
820 4) licnt(ltotal_procs): The number of non-zero entries in the sparse matrix
821 that couple the process represented by proc_list(j) to this process
822 5) lvoffset(ltotal_procs): The offsets for the non-zero data in the arrays
823 rval and jval for the blocks that couple this processor to other
824 processes in proc_list
825 6) offset(nprocs): the offset array for the distributed right hand side
826 vector
827 These arrays describe how the sparse matrix is layed out both locally and
828 across processors. In addition, the actual data for the distributed sparse
829 matrix is found in the following arrays:
830 1) rval: values of matrix for all blocks on this processor
831 2) jval: j-indices of matrix for all blocks on this processor
832 3) ival(ltotal_procs*(lnsize(me)+1)): starting index in rval and
833 jval for each row in each block */
834
835 NGA_Sync();
836
837 /* Create a sparse array of sparse blocks.
838 Each block element is divided into for sections.
839 The first section consists of 7 ints and contains the parameters
840 imin: minimum i index represented by block
841 imin: maximum i index represented by block
842 jmin: minimum j index represented by block
843 jmin: maximum j index represented by block
844 iblock: row index of block
845 jblock: column index of block
846 nnz: number of non-zero elements in block
847 The next section consists of nnz doubles that represent the non-zero values
848 in the block. The third section consists of nnz ints and contains the local
849 j indices of all values. The final section consists of (imax-imin+2) ints
850 and contains the starting index in jval and rval for the each row between
851 imin and imax. An extra value is included at the end and is set equal to
852 nnz+1. This is included to simplify some coding.
853 */
854
855 offset = 0;
856 for (i=0; i<me; i++) {
857 offset += nnz_list[i];
858 }
859 NGA_Put(*g_i, &me, &me, &offset, &one);
860 if (me==nprocs-1) {
861 NGA_Put(*g_i, &nprocs, &nprocs, &nnz, &one);
862 }
863 NGA_Sync();
864 for (i = 0; i<ltotal_procs; i++) {
865 /* evaluate total size of block */
866 b_nnz = ecnt[lproclist[i]];
867 isize = 7*sizeof(int) + b_nnz*(sizeof(double)+sizeof(int))
868 + (imax-imin+2)*sizeof(int);
869 blk_ptr = (int*)GP_Malloc(isize);
870
871 iparams = blk_ptr;
872 gp_rval = (double*)(iparams+7);
873 gp_jval = (int*)(gp_rval+b_nnz);
874 gp_ival = (gp_jval+b_nnz);
875
876 iparams[0] = imin;
877 iparams[1] = imax;
878 iparams[2] = jmin[lproclist[i]];
879 iparams[3] = jmax[lproclist[i]];
880 iparams[4] = me;
881 iparams[5] = lproclist[i];
882 iparams[6] = b_nnz;
883
884 ldj = (imax-imin+2);
885 k = 0;
886 toffset = lvoffset[i];
887 for (j=0; j<b_nnz; j++) {
888 gp_jval[j] = jval[toffset+j];
889 gp_rval[j] = rval[toffset+j];
890 }
891
892 toffset = ival[i*ldj];
893 for (k=0; k<ldj; k++) {
894 gp_ival[k] = ival[i*ldj+k]-toffset;
895 }
896
897 /* Assign blk_ptr to GP array element */
898 GP_Assign_local_element(*gp_block, &offset, (void*)blk_ptr, isize);
899 j = 1;
900 NGA_Put(*g_j,&offset,&offset,&lproclist[i],&j);
901 offset++;
902 }
903 NGA_Sync();
904
905 tmapc = (int*)malloc(nprocs*sizeof(int));
906 tmapc[0] = 0;
907 for (i=1; i<nprocs; i++) {
908 tmapc[i] = tmapc[i-1] + lnsize[i-1];
909 }
910 i = nprocs-1;
911 *imapc = tmapc;
912
913 free(rval);
914 free(jval);
915 free(ival);
916 free(ivalt);
917 free(xld);
918 free(yld);
919 free(zld);
920 free(lnsize);
921 free(lvoffset);
922 free(ecnt);
923 free(licnt);
924 free(lproclist);
925 free(lproc_inv);
926 free(jmin);
927 free(jmax);
928 free(nnz_list);
929 return;
930 }
931
main(int argc,char ** argv)932 int main(int argc, char **argv) {
933 int nmax, nprocs, me, me_plus;
934 int g_a_data, g_a_i, g_a_j, isize;
935 int gt_a_data, gt_a_i, gt_a_j;
936 int g_b, g_c;
937 int i, j, jj, k, one, jcnt;
938 int chunk, kp1, ld;
939 int *p_i, *p_j;
940 double *p_data, *p_b, *p_c;
941 double t_beg, t_beg2, t_ga_tot, t_get, t_mult, t_cnstrct, t_mpi_in, t_ga_in;
942 double t_hypre_strct, t_ga_trans, t_gp_get;
943 double t_get_blk_csr, t_trans_blk_csr, t_trans_blk, t_create_csr_ga, t_beg3;
944 double t_gp_tget, t_gp_malloc, t_gp_assign, t_beg4;
945 double prdot, dotga, dothypre, tempc;
946 double prtot, gatot, hypretot, gatot2, hypretot2;
947 double prdot2, prtot2;
948 int status;
949 int idim, jdim, kdim, idum, memsize;
950 int lsize, ntot;
951 int heap=200000, fudge=100, stack=200000, ma_heap;
952 double *cbuf, *vector;
953 int pdi, pdj, pdk, ip, jp, kp, ncells;
954 int lo[3],hi[3];
955 int blo[3], bhi[3];
956 int ld_a, ld_b, ld_c, ld_i, ld_j, irows, ioff, joff, total_procs;
957 int iproc, iblock, btot;
958 double *amat, *bvec;
959 int *ivec, *jvec;
960 int *proclist, *proc_inv, *icnt;
961 int *voffset, *offset, *mapc;
962 int iloop, lo_bl, hi_bl;
963 char *buf, **buf_ptr;
964 int *iparams, *jval, *ival;
965 double *rval, *rvalt;
966 int imin, imax, jmin, jmax, irow, icol, nnz;
967 int nrows, kmin, kmax, lmin, lmax, jdx;
968 int LOOPNUM = 100;
969 void **blk_ptr;
970 void *blk;
971 int blk_size, tsize, zero;
972 int *iblk, *jblk, *blkidx;
973 int *tblk_ptr;
974 int *ivalt, *jvalt, *iparamst;
975 int *iblk_t, *jblk_t, *blkidx_t;
976 /*
977 Hypre declarations
978 */
979 int ierr;
980 #if USE_HYPRE
981 HYPRE_StructGrid grid;
982 HYPRE_StructStencil stencil;
983 HYPRE_StructMatrix matrix;
984 HYPRE_StructVector vec_x, vec_y;
985 int i4, j4, ndim, nelems, offsets[7][3];
986 int stencil_indices[7], hlo[3], hhi[3];
987 double weights[7];
988 double *values;
989 double alpha, beta;
990 int *rows, *cols;
991 #endif
992 /*
993 *** Intitialize a message passing library
994 */
995 zero = 0;
996 one = 1;
997 ierr = MPI_Init(&argc, &argv);
998 /*
999 *** Initialize GA
1000
1001 There are 2 choices: ga_initialize or ga_initialize_ltd.
1002 In the first case, there is no explicit limit on memory usage.
1003 In the second, user can set limit (per processor) in bytes.
1004 */
1005 t_beg = GA_Wtime();
1006 NGA_Initialize();
1007 GP_Initialize();
1008 t_ga_in = GA_Wtime() - t_beg;
1009 NGA_Dgop(&t_ga_in,one,"+");
1010
1011 t_ga_tot = 0.0;
1012 t_ga_trans = 0.0;
1013 t_get_blk_csr = 0.0;
1014 t_create_csr_ga = 0.0;
1015 t_trans_blk_csr = 0.0;
1016 t_trans_blk = 0.0;
1017 t_gp_get = 0.0;
1018 t_gp_malloc = 0.0;
1019 t_gp_assign = 0.0;
1020 t_mult = 0.0;
1021 t_get = 0.0;
1022 t_gp_tget = 0.0;
1023 t_hypre_strct = 0.0;
1024 prtot = 0.0;
1025 prtot2 = 0.0;
1026 gatot = 0.0;
1027 hypretot = 0.0;
1028
1029 me = NGA_Nodeid();
1030 me_plus = me + 1;
1031 nprocs = NGA_Nnodes();
1032 if (me == 0) {
1033 printf("Time to initialize GA: %12.4f\n",
1034 t_ga_in/((double)nprocs));
1035 }
1036 /*
1037 we can also use GA_set_memory_limit BEFORE first ga_create call
1038 */
1039 ma_heap = heap + fudge;
1040 /* call GA_set_memory_limit(util_mdtob(ma_heap)) */
1041
1042 if (me == 0) {
1043 printf("\nNumber of cores used: %d\n\nGA initialized\n\n",nprocs);
1044 }
1045 /*
1046 *** Initialize the MA package
1047 MA must be initialized before any global array is allocated
1048 */
1049 if (!MA_init(MT_DBL, stack, ma_heap)) NGA_Error("ma_init failed",-1);
1050 /*
1051 create a sparse LMAX x LMAX matrix and two vectors of length
1052 LMAX. The matrix is stored in compressed row format.
1053 One of the vectors is filled with random data and the other
1054 is filled with zeros.
1055 */
1056 idim = IMAX;
1057 jdim = JMAX;
1058 kdim = KMAX;
1059 ntot = idim*jdim*kdim;
1060 if (me == 0) {
1061 printf("\nDimension of matrix: %d\n\n",ntot);
1062 }
1063 t_beg = GA_Wtime();
1064 grid_factor(nprocs,idim,jdim,kdim,&pdi,&pdj,&pdk);
1065 if (me == 0) {
1066 printf("\nProcessor grid configuration\n");
1067 printf(" PDX: %d\n",pdi);
1068 printf(" PDY: %d\n",pdj);
1069 printf(" PDZ: %d\n\n",pdk);
1070 printf(" Number of Loops: %d\n",LOOPNUM);
1071 }
1072
1073 create_laplace_mat(idim,jdim,kdim,pdi,pdj,pdk,&g_a_data,&g_a_j,&g_a_i,&mapc);
1074 t_cnstrct = GA_Wtime() - t_beg;
1075
1076 g_b = NGA_Create_handle();
1077 NGA_Set_data(g_b,one,&ntot,MT_DBL);
1078 NGA_Set_irreg_distr(g_b,mapc,&nprocs);
1079 status = NGA_Allocate(g_b);
1080 /*
1081 fill g_b with random values
1082 */
1083 NGA_Distribution(g_b,me,blo,bhi);
1084 NGA_Access(g_b,blo,bhi,&p_b,&ld);
1085 ld = bhi[0]-blo[0]+1;
1086 btot = ld;
1087 vector = (double*)malloc(ld*sizeof(double));
1088 for (i=0; i<ld; i++) {
1089 idum = 0;
1090 p_b[i] = ran3(&idum);
1091 vector[i] = p_b[i];
1092 }
1093 NGA_Release(g_b,blo,bhi);
1094 NGA_Sync();
1095
1096 g_c = NGA_Create_handle();
1097 NGA_Set_data(g_c,one,&ntot,MT_DBL);
1098 NGA_Set_irreg_distr(g_c,mapc,&nprocs);
1099 status = NGA_Allocate(g_c);
1100 NGA_Zero(g_c);
1101 #if USE_HYPRE
1102 /*
1103 Assemble HYPRE grid and use that to create matrix. Start by creating
1104 grid partition
1105 */
1106 ndim = 3;
1107 i = me;
1108 ip = i%pdi;
1109 i = (i-ip)/pdi;
1110 jp = i%pdj;
1111 kp = (i-jp)/pdj;
1112 lo[0] = (int)(((double)idim)*((double)ip)/((double)pdi));
1113 if (ip < pdi-1) {
1114 hi[0] = (int)(((double)idim)*((double)(ip+1))/((double)pdi)) - 1;
1115 } else {
1116 hi[0] = idim - 1;
1117 }
1118 lo[1] = (int)(((double)jdim)*((double)jp)/((double)pdj));
1119 if (jp < pdj-1) {
1120 hi[1] = (int)(((double)jdim)*((double)(jp+1))/((double)pdj)) - 1;
1121 } else {
1122 hi[1] = jdim - 1;
1123 }
1124 lo[2] = (int)(((double)kdim)*((double)kp)/((double)pdk));
1125 if (kp < pdk-1) {
1126 hi[2] = (int)(((double)kdim)*((double)(kp+1))/((double)pdk)) - 1;
1127 } else {
1128 hi[2] = kdim - 1;
1129 }
1130 /*
1131 Create grid
1132 */
1133 hlo[0] = lo[0];
1134 hlo[1] = lo[1];
1135 hlo[2] = lo[2];
1136 hhi[0] = hi[0];
1137 hhi[1] = hi[1];
1138 hhi[2] = hi[2];
1139 ierr = HYPRE_StructGridCreate(MPI_COMM_WORLD, ndim, &grid);
1140 ierr = HYPRE_StructGridSetExtents(grid, hlo, hhi);
1141 ierr = HYPRE_StructGridAssemble(grid);
1142 /*
1143 Create stencil
1144 */
1145 offsets[0][0] = 0;
1146 offsets[0][1] = 0;
1147 offsets[0][2] = 0;
1148
1149 offsets[1][0] = 1;
1150 offsets[1][1] = 0;
1151 offsets[1][2] = 0;
1152
1153 offsets[2][0] = 0;
1154 offsets[2][1] = 1;
1155 offsets[2][2] = 0;
1156
1157 offsets[3][0] = 0;
1158 offsets[3][1] = 0;
1159 offsets[3][2] = 1;
1160
1161 offsets[4][0] = -1;
1162 offsets[4][1] = 0;
1163 offsets[4][2] = 0;
1164
1165 offsets[5][0] = 0;
1166 offsets[5][1] = -1;
1167 offsets[5][2] = 0;
1168
1169 offsets[6][0] = 0;
1170 offsets[6][1] = 0;
1171 offsets[6][2] = -1;
1172
1173 nelems = 7;
1174 ierr = HYPRE_StructStencilCreate(ndim, nelems, &stencil);
1175 for (i=0; i<nelems; i++) {
1176 ierr = HYPRE_StructStencilSetElement(stencil, i, offsets[i]);
1177 }
1178
1179 ncells = (hi[0]-lo[0]+1)*(hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
1180 jcnt = 7*ncells;
1181 values = (double*)malloc(jcnt*sizeof(double));
1182 jcnt = 0;
1183 weights[0] = 6.0;
1184 weights[1] = -1.0;
1185 weights[2] = -1.0;
1186 weights[3] = -1.0;
1187 weights[4] = -1.0;
1188 weights[5] = -1.0;
1189 weights[6] = -1.0;
1190 for (i=0; i<ncells; i++) {
1191 for (j=0; j<7; j++) {
1192 values[jcnt] = weights[j];
1193 jcnt++;
1194 }
1195 }
1196
1197 ierr = HYPRE_StructMatrixCreate(MPI_COMM_WORLD, grid, stencil, &matrix);
1198 ierr = HYPRE_StructMatrixInitialize(matrix);
1199 for (i=0; i<7; i++) {
1200 stencil_indices[i] = i;
1201 }
1202 ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, 7, stencil_indices, values);
1203 free(values);
1204 /*
1205 Check all six sides of current box to see if any are boundaries.
1206 Set values to zero if they are.
1207 */
1208 if (hi[0] == idim-1) {
1209 ncells = (hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
1210 hlo[0] = idim-1;
1211 hhi[0] = idim-1;
1212 hlo[1] = lo[1];
1213 hhi[1] = hi[1];
1214 hlo[2] = lo[2];
1215 hhi[2] = hi[2];
1216 values = (double*)malloc(ncells*sizeof(double));
1217 for (i=0; i<ncells; i++) values[i] = 0.0;
1218 i4 = 1;
1219 j4 = 1;
1220 ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
1221 free(values);
1222 }
1223 if (hi[1] == jdim-1) {
1224 ncells = (hi[0]-lo[0]+1)*(hi[2]-lo[2]+1);
1225 hlo[0] = lo[0];
1226 hhi[0] = hi[0];
1227 hlo[1] = jdim-1;
1228 hhi[1] = jdim-1;
1229 hlo[2] = lo[2];
1230 hhi[2] = hi[2];
1231 values = (double*)malloc(ncells*sizeof(double));
1232 for (i=0; i<ncells; i++) values[i] = 0.0;
1233 i4 = 1;
1234 j4 = 2;
1235 ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
1236 free(values);
1237 }
1238 if (hi[2] == kdim-1) {
1239 ncells = (hi[0]-lo[0]+1)*(hi[1]-lo[1]+1);
1240 hlo[0] = lo[0];
1241 hhi[0] = hi[0];
1242 hlo[1] = lo[1];
1243 hhi[1] = hi[1];
1244 hlo[2] = kdim-1;
1245 hhi[2] = kdim-1;
1246 values = (double*)malloc(ncells*sizeof(double));
1247 for (i=0; i<ncells; i++) values[i] = 0.0;
1248 i4 = 1;
1249 j4 = 3;
1250 ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
1251 free(values);
1252 }
1253 if (lo[0] == 0) {
1254 ncells = (hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
1255 hlo[0] = 0;
1256 hhi[0] = 0;
1257 hlo[1] = lo[1];
1258 hhi[1] = hi[1];
1259 hlo[2] = lo[2];
1260 hhi[2] = hi[2];
1261 values = (double*)malloc(ncells*sizeof(double));
1262 for (i=0; i<ncells; i++) values[i] = 0.0;
1263 i4 = 1;
1264 j4 = 4;
1265 ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
1266 free(values);
1267 }
1268 if (lo[1] == 0) {
1269 ncells = (hi[0]-lo[0]+1)*(hi[2]-lo[2]+1);
1270 hlo[0] = lo[0];
1271 hhi[0] = hi[0];
1272 hlo[1] = 0;
1273 hhi[1] = 0;
1274 hlo[2] = lo[2];
1275 hhi[2] = hi[2];
1276 values = (double*)malloc(ncells*sizeof(double));
1277 for (i=0; i<ncells; i++) values[i] = 0.0;
1278 i4 = 1;
1279 j4 = 5;
1280 ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
1281 free(values);
1282 }
1283 if (lo[2] == 1) {
1284 ncells = (hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
1285 hlo[0] = lo[0];
1286 hhi[0] = hi[0];
1287 hlo[1] = lo[1];
1288 hhi[1] = hi[1];
1289 hlo[2] = 0;
1290 hhi[2] = 0;
1291 values = (double*)malloc(ncells*sizeof(double));
1292 for (i=0; i<ncells; i++) values[i] = 0.0;
1293 i4 = 1;
1294 j4 = 6;
1295 ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
1296 free(values);
1297 }
1298 ierr = HYPRE_StructMatrixAssemble(matrix);
1299 /*
1300 Create vectors for matrix-vector multiply
1301 */
1302 ierr = HYPRE_StructVectorCreate(MPI_COMM_WORLD, grid, &vec_x);
1303 ierr = HYPRE_StructVectorInitialize(vec_x);
1304 hlo[0] = lo[0];
1305 hlo[1] = lo[1];
1306 hlo[2] = lo[2];
1307 hhi[0] = hi[0];
1308 hhi[1] = hi[1];
1309 hhi[2] = hi[2];
1310 ierr = HYPRE_StructVectorSetBoxValues(vec_x, hlo, hhi, vector);
1311 ierr = HYPRE_StructVectorAssemble(vec_x);
1312 NGA_Distribution(g_a_i,me,blo,bhi);
1313
1314 if (bhi[1] > ntot-1) {
1315 bhi[1] = ntot-1;
1316 }
1317
1318 btot = (hi[0]-lo[0]+1)*(hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
1319
1320 for (i=0; i<btot; i++) vector[i] = 0.0;
1321 hlo[0] = lo[0];
1322 hlo[1] = lo[1];
1323 hlo[2] = lo[2];
1324 hhi[0] = hi[0];
1325 hhi[1] = hi[1];
1326 hhi[2] = hi[2];
1327 ierr = HYPRE_StructVectorGetBoxValues(vec_x, hlo, hhi, vector);
1328
1329 for (i=0; i<btot; i++) vector[i] = 0.0;
1330 ierr = HYPRE_StructVectorCreate(MPI_COMM_WORLD, grid, &vec_y);
1331 ierr = HYPRE_StructVectorInitialize(vec_y);
1332 ierr = HYPRE_StructVectorSetBoxValues(vec_y, hlo, hhi, vector);
1333 ierr = HYPRE_StructVectorAssemble(vec_y);
1334 #endif
1335 /* Multiply sparse matrix. Start by accessing pointers to local portions of
1336 g_a_data, g_a_j, g_a_i */
1337
1338 NGA_Sync();
1339 for (iloop=0; iloop<LOOPNUM; iloop++) {
1340 t_beg2 = GA_Wtime();
1341
1342 NGA_Distribution(g_c,me,blo,bhi);
1343 NGA_Access(g_c,blo,bhi,&p_c,&ld_c);
1344 for (i = 0; i<bhi[0]-blo[0]+1; i++) {
1345 p_c[i] = 0.0;
1346 }
1347
1348 /* get number of matrix blocks coupled to this process */
1349 NGA_Get(g_a_i,&me,&me,&lo_bl,&one);
1350 #if 1
1351 NGA_Get(g_a_i,&me_plus,&me_plus,&hi_bl,&one);
1352 hi_bl--;
1353 total_procs = hi_bl - lo_bl + 1;
1354 blk_ptr = (void**)malloc(sizeof(void*));
1355 /* Loop through matrix blocks */
1356 ioff = 0;
1357 for (iblock = 0; iblock<total_procs; iblock++) {
1358 t_beg = GA_Wtime();
1359 jdx = lo_bl+iblock;
1360 #if 0
1361 GP_Access_element(g_a_data, &jdx, &blk_ptr[0], &isize);
1362 #endif
1363 #if 1
1364 GP_Get_size(g_a_data, &jdx, &jdx, &isize);
1365 #endif
1366 blk = (void*)malloc(isize);
1367 #if 1
1368 GP_Get(g_a_data, &jdx, &jdx, blk, blk_ptr, &one, &blk_size, &one, &tsize, 0);
1369 #endif
1370 t_gp_get = t_gp_get + GA_Wtime() - t_beg;
1371 iparams = (int*)blk_ptr[0];
1372 rval = (double*)(iparams+7);
1373 imin = iparams[0];
1374 imax = iparams[1];
1375 jmin = iparams[2];
1376 jmax = iparams[3];
1377 irow = iparams[4];
1378 icol = iparams[5];
1379 nnz = iparams[6];
1380 jval = (int*)(rval+nnz);
1381 ival = (int*)(jval+nnz);
1382 nrows = imax - imin + 1;
1383 bvec = (double*)malloc((jmax-jmin+1)*sizeof(double));
1384 j = 0;
1385 t_beg = GA_Wtime();
1386 NGA_Get(g_b,&jmin,&jmax,bvec,&j);
1387 t_get = t_get + GA_Wtime() - t_beg;
1388 t_beg = GA_Wtime();
1389 for (i=0; i<nrows; i++) {
1390 kmin = ival[i];
1391 kmax = ival[i+1]-1;
1392 tempc = 0.0;
1393 for (j = kmin; j<=kmax; j++) {
1394 jj = jval[j];
1395 tempc = tempc + rval[j]*bvec[jj];
1396 }
1397 p_c[i] = p_c[i] + tempc;
1398 }
1399 t_mult = t_mult + GA_Wtime() - t_beg;
1400 free(bvec);
1401 free(blk);
1402 }
1403 NGA_Sync();
1404 t_ga_tot = t_ga_tot + GA_Wtime() - t_beg2;
1405
1406 NGA_Distribution(g_c,me,blo,bhi);
1407 NGA_Release(g_c,blo,bhi);
1408
1409 #if USE_HYPRE
1410 alpha = 1.0;
1411 beta = 0.0;
1412 t_beg = GA_Wtime();
1413 ierr = HYPRE_StructMatrixMatvec(alpha, matrix, vec_x, beta, vec_y);
1414 t_hypre_strct = t_hypre_strct + GA_Wtime() - t_beg;
1415 hlo[0] = lo[0];
1416 hlo[1] = lo[1];
1417 hlo[2] = lo[2];
1418 hhi[0] = hi[0];
1419 hhi[1] = hi[1];
1420 hhi[2] = hi[2];
1421 ierr = HYPRE_StructVectorGetBoxValues(vec_y, hlo, hhi, vector);
1422 NGA_Distribution(g_c,me,hlo,hhi);
1423 cbuf = (double*)malloc((hhi[0]-hlo[0]+1)*sizeof(double));
1424 NGA_Get(g_c,hlo,hhi,cbuf,&one);
1425 prdot = 0.0;
1426 dotga = 0.0;
1427 dothypre = 0.0;
1428 for (i=0; i<(hhi[0]-hlo[0]+1); i++) {
1429 dothypre = dothypre + vector[i]*vector[i];
1430 dotga = dotga + cbuf[i]*cbuf[i];
1431 prdot = prdot + (vector[i]-cbuf[i])*(vector[i]-cbuf[i]);
1432 }
1433 NGA_Dgop(&dotga,1,"+");
1434 NGA_Dgop(&dothypre,1,"+");
1435 NGA_Dgop(&prdot,1,"+");
1436 gatot += sqrt(dotga);
1437 hypretot += sqrt(dothypre);
1438 prtot += sqrt(prdot);
1439 free(cbuf);
1440 #endif
1441
1442 /* Transpose matrix. Start by making local copies of ival and jval arrays for
1443 the sparse matrix of blocks stored in the GP array */
1444 #if 1
1445 t_beg2 = GA_Wtime();
1446 t_beg3 = GA_Wtime();
1447 iblk = (int*)malloc((nprocs+1)*sizeof(int));
1448 iblk_t = (int*)malloc((nprocs+1)*sizeof(int));
1449 #if 0
1450 NGA_Get(g_a_i,&zero,&nprocs,iblk,&one);
1451 #else
1452 if (me == 0) {
1453 NGA_Get(g_a_i,&zero,&nprocs,iblk,&one);
1454 } else {
1455 for (i=0; i<nprocs+1; i++) {
1456 iblk[i] = 0;
1457 }
1458 }
1459 GA_Igop(iblk,nprocs+1,"+");
1460 #endif
1461 jblk = (int*)malloc(iblk[nprocs]*sizeof(int));
1462 jblk_t = (int*)malloc(iblk[nprocs]*sizeof(int));
1463 iblock = iblk[nprocs]-1;
1464 #if 0
1465 NGA_Get(g_a_j,&zero,&iblock,jblk,&one);
1466 #else
1467 if (me == 0) {
1468 NGA_Get(g_a_j,&zero,&iblock,jblk,&one);
1469 } else {
1470 for (i=0; i<iblock+1; i++) {
1471 jblk[i] = 0;
1472 }
1473 }
1474 GA_Igop(jblk,iblock+1,"+");
1475 #endif
1476 iblock++;
1477 blkidx = (int*)malloc(iblk[nprocs]*sizeof(int));
1478 blkidx_t = (int*)malloc(iblk[nprocs]*sizeof(int));
1479 for (i=0; i<iblock; i++) {
1480 blkidx[i] = i;
1481 }
1482 iblock = nprocs;
1483 t_get_blk_csr = t_get_blk_csr + GA_Wtime() - t_beg3;
1484 t_beg3 = GA_Wtime();
1485 stran(iblock, iblock, iblk, jblk, blkidx, iblk_t, jblk_t, blkidx_t);
1486 t_trans_blk_csr = t_trans_blk_csr + GA_Wtime() - t_beg3;
1487 t_beg3 = GA_Wtime();
1488 gt_a_data = GP_Create_handle();
1489 i = iblk_t[nprocs];
1490 GP_Set_dimensions(gt_a_data, one, &i);
1491 GP_Set_irreg_distr(gt_a_data, iblk_t, &nprocs);
1492 GP_Allocate(gt_a_data);
1493
1494 gt_a_j = NGA_Create_handle();
1495 i = iblk_t[nprocs];
1496 NGA_Set_data(gt_a_j, one, &i, C_INT);
1497 NGA_Set_irreg_distr(gt_a_j, iblk_t, &nprocs);
1498 NGA_Allocate(gt_a_j);
1499
1500 gt_a_i = NGA_Create_handle();
1501 i = nprocs+1;
1502 NGA_Set_data(gt_a_i,one,&i,C_INT);
1503 for (i=0; i<nprocs; i++) mapc[i] = i;
1504 NGA_Set_irreg_distr(gt_a_i, mapc, &nprocs);
1505 NGA_Allocate(gt_a_i);
1506
1507 /* copy i and j arrays of transposed matrix into distributed arrays */
1508 if (me==0) {
1509 lo_bl = 0;
1510 hi_bl = nprocs;
1511 NGA_Put(gt_a_i,&lo_bl,&hi_bl,iblk_t,&one);
1512 lo_bl = 0;
1513 hi_bl = iblk_t[nprocs]-1;
1514 NGA_Put(gt_a_j,&lo_bl,&hi_bl,jblk_t,&one);
1515 }
1516 NGA_Sync();
1517 lo_bl = iblk[me];
1518 hi_bl = iblk[me+1];
1519 total_procs = hi_bl - lo_bl + 1;
1520 total_procs = hi_bl - lo_bl;
1521 t_create_csr_ga = t_create_csr_ga + GA_Wtime() - t_beg3;
1522 for (iblock = lo_bl; iblock < hi_bl; iblock++) {
1523 t_beg4 = GA_Wtime();
1524 jdx = blkidx_t[iblock];
1525 GP_Get_size(g_a_data, &jdx, &jdx, &isize);
1526 blk = (void*)malloc(isize);
1527 GP_Get(g_a_data, &jdx, &jdx, blk, blk_ptr, &one, &blk_size, &one, &tsize, 0);
1528 /* Parameters for original block */
1529 iparams = (int*)blk_ptr[0];
1530 rval = (double*)(iparams+7);
1531 imin = iparams[0];
1532 imax = iparams[1];
1533 jmin = iparams[2];
1534 jmax = iparams[3];
1535 irow = iparams[4];
1536 icol = iparams[5];
1537 nnz = iparams[6];
1538 jval = (int*)(rval+nnz);
1539 ival = (int*)(jval+nnz);
1540
1541 /* Create transposed block */
1542 isize = 7*sizeof(int) + nnz*(sizeof(double)+sizeof(int))
1543 + (jmax-jmin+2)*sizeof(int);
1544 t_gp_tget = t_gp_tget + GA_Wtime() - t_beg4;
1545 t_beg4 = GA_Wtime();
1546 tblk_ptr = (int*)GP_Malloc(isize);
1547 t_gp_malloc = t_gp_malloc + GA_Wtime() - t_beg4;
1548 t_beg3 = GA_Wtime();
1549 iparamst = (int*)tblk_ptr;
1550 rvalt = (double*)(iparamst+7);
1551 jvalt = (int*)(rvalt+nnz);
1552 ivalt = (int*)(jvalt+nnz);
1553 iparamst[0] = jmin;
1554 iparamst[1] = jmax;
1555 iparamst[2] = imin;
1556 iparamst[3] = imax;
1557 iparamst[4] = icol;
1558 iparamst[5] = irow;
1559 iparamst[6] = nnz;
1560 i = imax-imin+1;
1561 j = jmax-jmin+1;
1562 stranr(i, j, ival, jval, rval, ivalt, jvalt, rvalt);
1563 t_trans_blk = t_trans_blk + GA_Wtime() - t_beg3;
1564 t_beg4 = GA_Wtime();
1565 GP_Assign_local_element(gt_a_data, &iblock, (void*)tblk_ptr, isize);
1566 t_gp_assign = t_gp_assign + GA_Wtime() - t_beg4;
1567 #if 1
1568 free(blk);
1569 #endif
1570 }
1571
1572 /* Clean up after transpose */
1573 #if 1
1574 free(iblk);
1575 free(iblk_t);
1576 free(jblk);
1577 free(jblk_t);
1578 free(blkidx);
1579 free(blkidx_t);
1580 #endif
1581 NGA_Sync();
1582 t_ga_trans = t_ga_trans + GA_Wtime() - t_beg2;
1583 #if USE_HYPRE
1584 alpha = 1.0;
1585 beta = 0.0;
1586 ierr = HYPRE_StructMatrixMatvec(alpha, matrix, vec_x, beta, vec_y);
1587 hlo[0] = lo[0];
1588 hlo[1] = lo[1];
1589 hlo[2] = lo[2];
1590 hhi[0] = hi[0];
1591 hhi[1] = hi[1];
1592 hhi[2] = hi[2];
1593 ierr = HYPRE_StructVectorGetBoxValues(vec_y, hlo, hhi, vector);
1594 NGA_Distribution(g_c,me,hlo,hhi);
1595 cbuf = (double*)malloc((hhi[0]-hlo[0]+1)*sizeof(double));
1596 NGA_Get(g_c,hlo,hhi,cbuf,&one);
1597 dothypre = 0.0;
1598 dotga = 0.0;
1599 prdot2 = 0.0;
1600 for (i=0; i<(hhi[0]-hlo[0]+1); i++) {
1601 dothypre = dothypre + vector[i]*vector[i];
1602 dotga = dotga + cbuf[i]*cbuf[i];
1603 if (fabs(vector[i]-cbuf[i]) > 1.0e-10) {
1604 printf("p[%d] i: %d vector: %f cbuf: %f\n",me,i,vector[i],cbuf[i]);
1605 }
1606 prdot2 = prdot2 + (vector[i]-cbuf[i])*(vector[i]-cbuf[i]);
1607 }
1608 NGA_Dgop(&dotga,1,"+");
1609 NGA_Dgop(&dothypre,1,"+");
1610 NGA_Dgop(&prdot2,1,"+");
1611 prtot2 += sqrt(prdot2);
1612 gatot2 += sqrt(dotga);
1613 hypretot2 += sqrt(dothypre);
1614 free(cbuf);
1615 free(blk_ptr);
1616 #endif
1617 /* Clean up transposed matrix */
1618 GP_Distribution(gt_a_data,me,blo,bhi);
1619 for (i=blo[0]; i<bhi[0]; i++) {
1620 GP_Free(GP_Free_local_element(gt_a_data,&i));
1621 }
1622 GP_Destroy(gt_a_data);
1623 NGA_Destroy(gt_a_i);
1624 NGA_Destroy(gt_a_j);
1625 #endif
1626 #endif
1627 }
1628 free(vector);
1629 #if USE_HYPRE
1630 if (me == 0) {
1631 printf("Magnitude of GA solution: %e\n",
1632 gatot/((double)LOOPNUM));
1633 printf("Magnitude of HYPRE solution: %e\n",
1634 hypretot/((double)LOOPNUM));
1635 printf("Magnitude of GA solution(2): %e\n",
1636 gatot2/((double)LOOPNUM));
1637 printf("Magnitude of HYPRE solution(2): %e\n",
1638 hypretot2/((double)LOOPNUM));
1639 printf("Difference between GA and HYPRE (Struct) results: %e\n",
1640 prtot/((double)LOOPNUM));
1641 printf("Difference between transpose and HYPRE results: %e\n",
1642 prtot2/((double)LOOPNUM));
1643 }
1644 #endif
1645
1646 /*
1647 Clean up arrays
1648 */
1649 NGA_Destroy(g_b);
1650 NGA_Destroy(g_c);
1651 GP_Distribution(g_a_data,me,blo,bhi);
1652 for (i=blo[0]; i<bhi[0]; i++) {
1653 GP_Free(GP_Free_local_element(g_a_data,&i));
1654 }
1655 GP_Destroy(g_a_data);
1656 NGA_Destroy(g_a_i);
1657 NGA_Destroy(g_a_j);
1658 #if USE_HYPRE
1659 ierr = HYPRE_StructStencilDestroy(stencil);
1660 ierr = HYPRE_StructGridDestroy(grid);
1661 ierr = HYPRE_StructMatrixDestroy(matrix);
1662 ierr = HYPRE_StructVectorDestroy(vec_x);
1663 ierr = HYPRE_StructVectorDestroy(vec_y);
1664 #endif
1665
1666 NGA_Dgop(&t_cnstrct,1,"+");
1667 NGA_Dgop(&t_get,1,"+");
1668 NGA_Dgop(&t_gp_get,1,"+");
1669 NGA_Dgop(&t_mult,1,"+");
1670 NGA_Dgop(&t_ga_tot,1,"+");
1671 NGA_Dgop(&t_ga_trans,1,"+");
1672 NGA_Dgop(&t_get_blk_csr,1,"+");
1673 NGA_Dgop(&t_trans_blk_csr,1,"+");
1674 NGA_Dgop(&t_trans_blk,1,"+");
1675 NGA_Dgop(&t_create_csr_ga,1,"+");
1676 NGA_Dgop(&t_gp_tget,1,"+");
1677 NGA_Dgop(&t_gp_malloc,1,"+");
1678 NGA_Dgop(&t_gp_assign,1,"+");
1679 #if USE_HYPRE
1680 NGA_Dgop(&t_hypre_strct,1,"+");
1681 #endif
1682 free(mapc);
1683
1684 if (me == 0) {
1685 printf("Time to create sparse matrix: %12.4f\n",
1686 t_cnstrct/((double)(nprocs*LOOPNUM)));
1687 printf("Time to get right hand side vector: %12.4f\n",
1688 t_get/((double)(nprocs*LOOPNUM)));
1689 printf("Time to get GP blocks: %12.4f\n",
1690 t_gp_get/((double)(nprocs*LOOPNUM)));
1691 printf("Time for sparse matrix block multiplication: %12.4f\n",
1692 t_mult/((double)(nprocs*LOOPNUM)));
1693 printf("Time for total sparse matrix multiplication: %12.4f\n",
1694 t_ga_tot/((double)(nprocs*LOOPNUM)));
1695 #if USE_HYPRE
1696 printf("Total time for HYPRE (Struct) matrix-vector multiply:%12.4f\n",
1697 t_hypre_strct/((double)(nprocs*LOOPNUM)));
1698 #endif
1699 printf("Time to get block CSR distribution: %12.4f\n",
1700 t_get_blk_csr/((double)(nprocs*LOOPNUM)));
1701 printf("Time for transposing block CSR distribution: %12.4f\n",
1702 t_trans_blk_csr/((double)(nprocs*LOOPNUM)));
1703 printf("Time for creating transposed block CSR GA: %12.4f\n",
1704 t_create_csr_ga/((double)(nprocs*LOOPNUM)));
1705 printf("Time for transposing blocks: %12.4f\n",
1706 t_trans_blk/((double)(nprocs*LOOPNUM)));
1707 printf("Time to get GP blocks for transpose: %12.4f\n",
1708 t_gp_tget/((double)(nprocs*LOOPNUM)));
1709 printf("Time to malloc GP blocks for transpose: %12.4f\n",
1710 t_gp_malloc/((double)(nprocs*LOOPNUM)));
1711 printf("Time to assign GP blocks for transpose: %12.4f\n",
1712 t_gp_assign/((double)(nprocs*LOOPNUM)));
1713 printf("Time for total sparse matrix transpose: %12.4f\n",
1714 t_ga_trans/((double)(nprocs*LOOPNUM)));
1715 }
1716 if (me==0) {
1717 printf("Terminating GA library\n");
1718 }
1719 NGA_Terminate();
1720 /*
1721 *** Tidy up after message-passing library
1722 */
1723 ierr = MPI_Finalize();
1724 }
1725