1 #include <stdlib.h>
2 #include <math.h>
3 #include <stdio.h>
4
5 #include "macdecls.h"
6 #include "ga.h"
7 #include "mp3.h"
8
9 #define USE_HYPRE 0
10 #define IMAX 200
11 #define JMAX 200
12 #define KMAX 200
13 #define LMAX IMAX*JMAX*KMAX
14
15 #define bb_a(ib) bb_v(bb_i + (ib))
16 #define cc_a(ib) cc_v(cc_i + (ib))
17
18 #define Integer int
19 #define MAX_FACTOR 10000
20
21 /**
22 * If this test is built standalone and then linked to the Hypre library,
23 * the Hypre sparse matrix-vector multiply routines can be used to check the
24 * correctness of the answers
25 */
26 #if USE_HYPRE
27 #include "HYPRE.h"
28 #include "HYPRE_struct_mv.h"
29 #include "mpi.h"
30 #endif
31
grid_factor(int p,int xdim,int ydim,int zdim,int * idx,int * idy,int * idz)32 void grid_factor(int p, int xdim, int ydim, int zdim, int *idx, int *idy, int *idz) {
33 int i, j;
34 int ip, ifac, pmax, prime[MAX_FACTOR];
35 int fac[MAX_FACTOR];
36 int ix, iy, iz, ichk;
37
38 i = 1;
39 /**
40 * factor p completely
41 * first, find all prime numbers, besides 1, less than or equal to
42 * the square root of p
43 */
44 ip = (int)(sqrt((double)p))+1;
45 pmax = 0;
46 for (i=2; i<=ip; i++) {
47 ichk = 1;
48 for (j=0; j<pmax; j++) {
49 if (i%prime[j] == 0) {
50 ichk = 0;
51 break;
52 }
53 }
54 if (ichk) {
55 pmax = pmax + 1;
56 if (pmax > MAX_FACTOR) printf("Overflow in grid_factor\n");
57 prime[pmax-1] = i;
58 }
59 }
60 /**
61 * find all prime factors of p
62 */
63 ip = p;
64 ifac = 0;
65 for (i=0; i<pmax; i++) {
66 while(ip%prime[i] == 0) {
67 ifac = ifac + 1;
68 fac[ifac-1] = prime[i];
69 ip = ip/prime[i];
70 }
71 }
72 /**
73 * p is prime
74 */
75 if (ifac==0) {
76 ifac++;
77 fac[0] = p;
78 }
79 /**
80 * find three factors of p of approximately the
81 * same size
82 */
83 *idx = 1;
84 *idy = 1;
85 *idz = 1;
86 for (i = ifac-1; i >= 0; i--) {
87 ix = xdim/(*idx);
88 iy = ydim/(*idy);
89 iz = zdim/(*idz);
90 if (ix >= iy && ix >= iz && ix > 1) {
91 *idx = fac[i]*(*idx);
92 } else if (iy >= ix && iy >= iz && iy > 1) {
93 *idy = fac[i]*(*idy);
94 } else if (iz >= ix && iz >= iy && iz > 1) {
95 *idz = fac[i]*(*idz);
96 } else {
97 printf("Too many processors in grid factoring routine\n");
98 }
99 }
100 }
101
102 /**
103 * Short subroutine for multiplying sparse matrix block with vector segment
104 */
/**
 * Short subroutine for multiplying a sparse matrix block (CSR format) with a
 * vector segment and accumulating into cvec: cvec += A * bvec.
 * ivec[i]..ivec[i+1]-1 index the nonzeros of row i in a_mat/jvec.
 */
void loc_matmul(double *a_mat, int *jvec, int *ivec,
                double *bvec, double *cvec, int nrows) {
  double tempc;
  int i, j, jj, jmin, jmax;
  for (i=0; i<nrows; i++) {
    jmin = ivec[i];
    jmax = ivec[i+1]-1;  /* inclusive index of the LAST nonzero in row i */
    tempc = 0.0;
    /* BUG FIX: loop must be inclusive (j<=jmax); the original `j<jmax`
       dropped the last nonzero of every row (the equivalent loop in main
       uses j<=jmax) */
    for (j=jmin; j<=jmax; j++) {
      jj = jvec[j];
      tempc = tempc + a_mat[j]*bvec[jj];
    }
    cvec[i] = cvec[i] + tempc;
  }
}
120 /**
121 * Random number generator
122 */
/**
 * Random number generator: returns a uniform deviate in [0,1].
 * Re-seeds from |*idum| on the first call or whenever *idum is negative,
 * then resets *idum to 1 (mimics the classic Numerical Recipes interface).
 */
double ran3(int *idum) {
  static int iff = 0;
  double randnum;
  /* BUG FIX: original tested the POINTER (`idum < 0`) instead of the seed
     value, so passing a negative seed after the first call never re-seeded */
  if (*idum < 0 || iff == 0) {
    iff = 1;
    srand((unsigned int)abs(*idum));
    *idum = 1;
  }
  randnum = ((double)rand())/((double)RAND_MAX);
  return randnum;
}
134
135 /**
136 * create a sparse matrix in compressed row form corresponding to a Laplacian
137 * differential operator using a 7-point stencil for a grid on a lattice of
138 * dimension idim x jdim x kdim grid points
139 */
create_laplace_mat(int idim,int jdim,int kdim,int pdi,int pdj,int pdk,int * g_data,int * g_i,int * g_j,int * total_procs,int ** proclist,int ** proc_inv,int ** icnt,int ** voffset,int ** nsize,int ** offset,int * tsize,int ** imapc)140 void create_laplace_mat(int idim, int jdim, int kdim, int pdi, int pdj, int pdk,
141 int *g_data, int *g_i, int *g_j, int *total_procs,
142 int **proclist, int **proc_inv, int **icnt, int **voffset,
143 int **nsize, int **offset, int *tsize, int **imapc)
144 {
145 /**
146 * idim: i-dimension of grid
147 * jdim: j-dimension of grid
148 * kdim: k-dimension of grid
149 * pdi: i-dimension of processor grid
150 * pdj: j-dimension of processor grid
151 * pdk: k-dimension of processor grid
152 * g_data: global array of values
153 * g_j: global array containing j indices (using local indices)
154 * g_i: global array containing starting location of each row in g_j
155 * (using local indices)
156 * total_procs: number of processors this proc interacts with
157 * proclist: list of processors that this process interacts with
158 * proc_inv: given a processor, map it to position in proclist
159 * icnt: number of elements in submatrix
160 * voffset: offset for each submatrix block
161 * nsize: number of elements in right hand side vector on each processor
162 * offset: starting index of right hand side vector on each processor
163 * tsize: total number of non-zero elements in matrix
164 * imapc: map array for vectors
165 */
166 int ltotal_procs, ltsize;
167 int *lproclist, *lproc_inv, *lvoffset, *lnsize, *loffset, *licnt, *limapc;
168 int nprocs, me, imin, imax, jcnt, jmin, jmax;
169 int ix, iy, iz, idx;
170 double x, dr;
171 double *rval;
172 int isize, idbg;
173 int *jval, *ival, *ivalt;
174 int i, j, k, itmp, one, tlo, thi, ld;
175 int idum, ntot, indx, nghbrs[7], ncnt;
176 int ixn[7],iyn[7],izn[7], procid[7];
177 int status;
178 int lo[3], hi[3], ip, jp, kp, ldi, ldj, jdx, joff;
179 int il, jl, kl, ldmi, ldpi, ldmj, ldpj;
180 int *xld, *yld, *zld, *tmapc;
181 int *ecnt, *total_distr;
182 int total_max;
183 FILE *fp, *fopen();
184
185 me = GA_Nodeid();
186 nprocs = GA_Nnodes();
187 idum = -(12345+me);
188 x = ran3(&idum);
189 one = 1;
190
191 if (me == 0) {
192 printf("\n Dimension of grid: \n\n");
193 printf(" I Dimension: %d\n",idim);
194 printf(" J Dimension: %d\n",jdim);
195 printf(" K Dimension: %d\n\n",kdim);
196 }
197 /**
198 * Find position of processor in processor grid and calulate minimum
199 * and maximum values of indices
200 */
201 i = me;
202 ip = i%pdi;
203 i = (i-ip)/pdi;
204 jp = i%pdj;
205 kp = (i-jp)/pdj;
206
207 lo[0] = (int)((((double)idim)*((double)ip))/((double)pdi));
208 if (ip < pdi-1) {
209 hi[0] = (int)((((double)idim)*((double)(ip+1)))/((double)pdi))-1;
210 } else {
211 hi[0] = idim - 1;
212 }
213
214 lo[1] = (int)((((double)jdim)*((double)jp))/((double)pdj));
215 if (jp < pdj-1) {
216 hi[1] = (int)((((double)jdim)*((double)(jp+1)))/((double)pdj))-1;
217 } else {
218 hi[1] = jdim - 1;
219 }
220
221 lo[2] = (int)((((double)kdim)*((double)kp))/((double)pdk));
222 if (kp < pdk-1) {
223 hi[2] = (int)((((double)kdim)*((double)(kp+1)))/((double)pdk))-1;
224 } else {
225 hi[2] = kdim - 1;
226 }
227
228 /**
229 * Determine stride lengths. Start with stride lengths for grid block
230 * owned by this processor
231 */
232 ldi = hi[0]-lo[0]+1;
233 ldj = hi[1]-lo[1]+1;
234
235 /**
236 * Find stride lengths for blocks owned by other processors
237 */
238 xld = (int*)malloc(pdi*sizeof(int));
239 for (i=0; i<pdi; i++) {
240 if (i<pdi-1) {
241 xld[i] = (int)((((double)idim)*((double)(i+1)))/((double)pdi));
242 } else {
243 xld[i] = idim;
244 }
245 xld[i] = xld[i] - (int)((((double)idim)*((double)(i)))/((double)pdi));
246 }
247
248 yld = (int*)malloc(pdj*sizeof(int));
249 for (i=0; i<pdj; i++) {
250 if (i<pdj-1) {
251 yld[i] = (int)((((double)jdim)*((double)(i+1)))/((double)pdj));
252 } else {
253 yld[i] = jdim;
254 }
255 yld[i] = yld[i] - (int)((((double)jdim)*((double)(i)))/((double)pdj));
256 }
257
258 zld = (int*)malloc(pdk*sizeof(int));
259 for (i=0; i<pdk; i++) {
260 if (i<pdk-1) {
261 zld[i] = (int)((((double)kdim)*((double)(i+1)))/((double)pdk));
262 } else {
263 zld[i] = jdim;
264 }
265 zld[i] = zld[i] - (int)((((double)kdim)*((double)(i)))/((double)pdk));
266 }
267
268 /**
269 * Determine number of rows per processor
270 */
271
272 lnsize = (int*)malloc(nprocs*sizeof(int));
273 *nsize = lnsize;
274 loffset = (int*)malloc(nprocs*sizeof(int));
275 *offset = loffset;
276 for (i=0; i<nprocs; i++) {
277 lnsize[i] = 0;
278 loffset[i] = 0;
279 }
280 lnsize[me] = (hi[0]-lo[0]+1)*(hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
281 GA_Igop(lnsize,nprocs,"+");
282 loffset[0] = 0;
283 for (i=1; i<nprocs; i++) {
284 loffset[i] = loffset[i-1] + lnsize[i-1];
285 }
286
287 ntot = idim*jdim*kdim;
288 NGA_Sync();
289 /**
290 * Scan over rows of lattice
291 */
292 imin = loffset[me];
293 imax = loffset[me]+lnsize[me]-1;
294 /**
295 * Find out how many other processors couple to this row of blocks. Start by
296 * initializing ecnt array to zero
297 */
298 ecnt = (int*)malloc(nprocs*sizeof(int));
299 for (i=0; i<nprocs; i++) {
300 ecnt[i] = 0;
301 }
302
303 /**
304 * Loop over all rows owned by this processor
305 */
306 for (i=imin; i<=imax; i++) {
307 /**
308 * Compute local indices of grid point corresponding to row i (this
309 * corresponds to an element of the grid)
310 */
311 indx = i - imin;
312 ix = indx%ldi;
313 indx = (indx - ix)/ldi;
314 iy = indx%ldj;
315 iz = (indx - iy)/ldj;
316 ix = ix + lo[0];
317 iy = iy + lo[1];
318 iz = iz + lo[2];
319
320 /**
321 * Check all neighbors. Mark the ecnt element corresponding to the processor
322 * that owns the element
323 */
324 ecnt[me] = ecnt[me] + 1;
325 if (ix+1 <= idim-1) {
326 if (ix+1 > hi[0]) {
327 jdx = kp*pdi*pdj + jp*pdi + ip + 1;
328 ecnt[jdx] = ecnt[jdx] + 1;
329 } else {
330 ecnt[me] = ecnt[me] + 1;
331 }
332 }
333 if (ix-1 >= 0) {
334 if (ix-1 < lo[0]) {
335 jdx = kp*pdi*pdj + jp*pdi + ip - 1;
336 ecnt[jdx] = ecnt[jdx] + 1;
337 } else {
338 ecnt[me] = ecnt[me] + 1;
339 }
340 }
341 if (iy+1 <= jdim-1) {
342 if (iy+1 > hi[1]) {
343 jdx = kp*pdi*pdj + (jp+1)*pdi + ip;
344 ecnt[jdx] = ecnt[jdx] + 1;
345 } else {
346 ecnt[me] = ecnt[me] + 1;
347 }
348 }
349 if (iy-1 >= 0) {
350 if (iy-1 < lo[1]) {
351 jdx = kp*pdi*pdj + (jp-1)*pdi + ip;
352 ecnt[jdx] = ecnt[jdx] + 1;
353 } else {
354 ecnt[me] = ecnt[me] + 1;
355 }
356 }
357 if (iz+1 <= kdim-1) {
358 if (iz+1 > hi[2]) {
359 jdx = (kp+1)*pdi*pdj + jp*pdi + ip;
360 ecnt[jdx] = ecnt[jdx] + 1;
361 } else {
362 ecnt[me] = ecnt[me] + 1;
363 }
364 }
365 if (iz-1 >= 0) {
366 if (iz-1 < lo[2]) {
367 jdx = (kp-1)*pdi*pdj + jp*pdi + ip;
368 ecnt[jdx] = ecnt[jdx] + 1;
369 } else {
370 ecnt[me] = ecnt[me] + 1;
371 }
372 }
373 }
374
375 /**
376 * Create a list of processors that this processor is coupled to. Also count
377 * the total number of grid points that this processor couples to. This number
378 * is equal to the total number of matrix elements held by this processor.
379 */
380 ltotal_procs = 0;
381 ncnt = 0;
382 for (i=0; i<nprocs; i++) {
383 if (ecnt[i] > 0) {
384 ltotal_procs += 1;
385 ncnt += ecnt[i];
386 }
387 }
388 *total_procs = ltotal_procs;
389 lproclist = (int*)malloc(ltotal_procs*sizeof(int));
390 *proclist = lproclist;
391 lproc_inv = (int*)malloc(nprocs*sizeof(int));
392 *proc_inv = lproc_inv;
393 licnt = (int*)malloc(ltotal_procs*sizeof(int));
394 *icnt = licnt;
395
396 /**
397 * Set up conventional CSR arrays to hold portion of matrix owned by this
398 * processor
399 */
400 rval = (double*)malloc(ncnt*sizeof(double));
401 idbg = ncnt;
402 jval = (int*)malloc(ncnt*sizeof(int));
403 ival = (int*)malloc((imax-imin+2)*ltotal_procs*sizeof(int));
404 ivalt = (int*)malloc((imax-imin+2)*ltotal_procs*sizeof(int));
405
406 for (i=0; i<ncnt; i++) {
407 rval[i] = 0.0;
408 jval[i] = 0;
409 }
410
411 j = (imax-imin+2)*ltotal_procs;
412 for (i=0; i<j; i++) {
413 ival[i] = 0;
414 ivalt[i] = 0;
415 }
416
417 j = 0;
418 for (i=0; i<nprocs; i++) {
419 lproc_inv[i] = -1;
420 }
421 /**
422 * voffset represents vertical partitions in the row block that divide it into
423 * blocks that couple to grid elements on other processors
424 */
425 lvoffset = (int*)malloc(ltotal_procs*sizeof(int));
426 *voffset = lvoffset;
427 lvoffset[0] = 0;
428 for (i=0; i<nprocs; i++) {
429 if (ecnt[i] > 0) {
430 lproclist[j] = i;
431 if (j > 0) {
432 lvoffset[j] = ecnt[lproclist[j-1]]+lvoffset[j-1];
433 }
434 lproc_inv[i] = j;
435 j++;
436 }
437 }
438 free(ecnt);
439
440 isize = imax-imin+2;
441 for (i=0; i<ltotal_procs; i++) {
442 licnt[i] = 0;
443 }
444 ltsize = 0;
445 for (i=imin; i<=imax; i++) {
446 /**
447 * compute local indices of grid point corresponding to row i
448 */
449 indx = i - imin;
450 ix = indx%ldi;
451 indx = (indx - ix)/ldi;
452 iy = indx%ldj;
453 iz = (indx - iy)/ldj;
454 ix = ix + lo[0];
455 iy = iy + lo[1];
456 iz = iz + lo[2];
457 /**
458 * find locations of neighbors in 7-point stencil (if they are on the grid)
459 */
460 ncnt = 0;
461 ixn[ncnt] = ix;
462 iyn[ncnt] = iy;
463 izn[ncnt] = iz;
464 il = ix - lo[0];
465 jl = iy - lo[1];
466 kl = iz - lo[2];
467 idx = kl*ldi*ldj + jl*ldi + il;
468 nghbrs[ncnt] = idx;
469 procid[ncnt] = me;
470 if (ix+1 <= idim - 1) {
471 ncnt++;
472 ixn[ncnt] = ix + 1;
473 iyn[ncnt] = iy;
474 izn[ncnt] = iz;
475 if (ix+1 > hi[0]) {
476 jdx = kp*pdi*pdj + jp*pdi + ip + 1;
477 il = 0;
478 jl = iy - lo[1];
479 kl = iz - lo[2];
480 ldpi = xld[ip+1];
481 } else {
482 jdx = me;
483 il = ix - lo[0] + 1;
484 jl = iy - lo[1];
485 kl = iz - lo[2];
486 ldpi = ldi;
487 }
488 idx = kl*ldpi*ldj + jl*ldpi + il;
489 nghbrs[ncnt] = idx;
490 procid[ncnt] = jdx;
491 }
492 if (ix-1 >= 0) {
493 ncnt++;
494 ixn[ncnt] = ix - 1;
495 iyn[ncnt] = iy;
496 izn[ncnt] = iz;
497 if (ix-1 < lo[0]) {
498 jdx = kp*pdi*pdj + jp*pdi + ip - 1;
499 il = xld[ip-1] - 1;
500 jl = iy - lo[1];
501 kl = iz - lo[2];
502 ldmi = xld[ip-1];
503 } else {
504 jdx = me;
505 il = ix - lo[0] - 1;
506 jl = iy - lo[1];
507 kl = iz - lo[2];
508 ldmi = ldi;
509 }
510 idx = kl*ldmi*ldj + jl*ldmi + il;
511 nghbrs[ncnt] = idx;
512 procid[ncnt] = jdx;
513 }
514 if (iy+1 <= jdim-1) {
515 ncnt++;
516 ixn[ncnt] = ix;
517 iyn[ncnt] = iy + 1;
518 izn[ncnt] = iz;
519 if (iy+1 > hi[1]) {
520 jdx = kp*pdi*pdj + (jp+1)*pdi + ip;
521 il = ix - lo[0];
522 jl = 0;
523 kl = iz - lo[2];
524 ldpj = yld[jp+1];
525 } else {
526 jdx = me;
527 il = ix - lo[0];
528 jl = iy - lo[1] + 1;
529 kl = iz - lo[2];
530 ldpj = ldj;
531 }
532 idx = kl*ldi*ldpj + jl*ldi + il;
533 nghbrs[ncnt] = idx;
534 procid[ncnt] = jdx;
535 }
536 if (iy-1 >= 0) {
537 ncnt++;
538 ixn[ncnt] = ix;
539 iyn[ncnt] = iy - 1;
540 izn[ncnt] = iz;
541 if (iy-1 < lo[1]) {
542 jdx = kp*pdi*pdj + (jp-1)*pdi + ip;
543 il = ix - lo[0];
544 jl = yld[jp-1] - 1;
545 kl = iz - lo[2];
546 ldmj = yld[jp-1];
547 } else {
548 jdx = me;
549 il = ix - lo[0];
550 jl = iy - lo[1] - 1;
551 kl = iz - lo[2];
552 ldmj = ldj;
553 }
554 idx = kl*ldi*ldmj + jl*ldi + il;
555 nghbrs[ncnt] = idx;
556 procid[ncnt] = jdx;
557 }
558 if (iz+1 <= kdim-1) {
559 ncnt++;
560 ixn[ncnt] = ix;
561 iyn[ncnt] = iy;
562 izn[ncnt] = iz + 1;
563 if (iz+1 > hi[2]) {
564 jdx = (kp+1)*pdi*pdj + jp*pdi + ip;
565 il = ix - lo[0];
566 jl = iy - lo[1];
567 kl = 0;
568 } else {
569 jdx = me;
570 il = ix - lo[0];
571 jl = iy - lo[1];
572 kl = iz - lo[2] + 1;
573 }
574 idx = kl*ldi*ldj + jl*ldi + il;
575 nghbrs[ncnt] = idx;
576 procid[ncnt] = jdx;
577 }
578 if (iz-1 >= 0) {
579 ncnt++;
580 ixn[ncnt] = ix;
581 iyn[ncnt] = iy;
582 izn[ncnt] = iz - 1;
583 if (iz-1 < lo[2]) {
584 jdx = (kp-1)*pdi*pdj + jp*pdi + ip;
585 il = ix - lo[0];
586 jl = iy - lo[1];
587 kl = zld[kp-1] - 1;
588 } else {
589 jdx = me;
590 il = ix - lo[0];
591 jl = iy - lo[1];
592 kl = iz - lo[2] - 1;
593 }
594 idx = kl*ldi*ldj + jl*ldi + il;
595 nghbrs[ncnt] = idx;
596 procid[ncnt] = jdx;
597 }
598 /**
599 * sort j indices. This uses a simple bubble sort, but ncnt should be small. A
600 * more sophisticated approach could be taken if this is too time consuming.
601 */
602 ncnt++;
603 for (j=0; j<ncnt; j++) {
604 for (k=j+1; k<ncnt; k++) {
605 if (nghbrs[j] > nghbrs[k]) {
606 itmp = nghbrs[j];
607 nghbrs[j] = nghbrs[k];
608 nghbrs[k] = itmp;
609 itmp = ixn[j];
610 ixn[j] = ixn[k];
611 ixn[k] = itmp;
612 itmp = iyn[j];
613 iyn[j] = iyn[k];
614 iyn[k] = itmp;
615 itmp = izn[j];
616 izn[j] = izn[k];
617 izn[k] = itmp;
618 itmp = procid[j];
619 procid[j] = procid[k];
620 procid[k] = itmp;
621 }
622 }
623 }
624 for (k=0; k<ncnt; k++) {
625 if (nghbrs[k] < 0 || nghbrs[k] >= ntot) {
626 printf("p[%d] Invalid neighbor %d\n",me,nghbrs[k]);
627 }
628 }
629 /**
630 * create array elements
631 * for (j=0; j<ltotal_procs; j++) {
632 * printf("p[%d] lvoffset[%d]: %d\n",me,j,lvoffset[j]);
633 * }
634 */
635 for (j=0; j<ncnt; j++) {
636 jdx = procid[j];
637 idx = lproc_inv[jdx];
638 if (ix == ixn[j] && iy == iyn[j] && iz == izn[j]) {
639 rval[lvoffset[idx]+licnt[idx]] = 6.0;
640 } else {
641 rval[lvoffset[idx]+licnt[idx]] = -1.0;
642 }
643 if (lvoffset[idx]+licnt[idx]>=idbg) {
644 }
645 /* TODO: Check this carefully */
646 jval[lvoffset[idx]+licnt[idx]] = nghbrs[j];
647 ivalt[idx*isize+i-imin] = ivalt[idx*isize+i-imin]+1;
648 licnt[idx]++;
649 }
650 }
651 for (i=0; i<ltotal_procs; i++) {
652 ival[i*isize] = lvoffset[i];
653 for (j=1; j<isize; j++) {
654 ival[i*isize+j] = ival[i*isize+j-1] + ivalt[i*isize+j-1];
655 }
656 }
657 free(ivalt);
658 isize = 0;
659 for (i=0; i<ltotal_procs; i++) {
660 isize = isize + licnt[i];
661 }
662 ltsize = isize;
663 GA_Igop(<size,one,"+");
664 /**
665 * Local portion of sparse matrix has been evaluated and decomposed into blocks
666 * that match partitioning of right hand side across processors. The following
667 * data is available at this point:
668 * 1) total_procs: the number of processors that are coupled to this one via
669 * the sparse matrix
670 * 2) proc_list(total_procs): a list of processor IDs that are coupled to
671 * this processor
672 * 3) proc_inv(nprocs): The entry in proc_list that corresponds to a given
673 * processor. If the entry is zero then that processor does not couple to
674 * this processor.
675 * 4) icnt(total_procs): The number of non-zero entries in the sparse matrix
676 * that couple the process represented by proc_list(j) to this process
677 * 5) voffset(total_procs): The offsets for the non-zero data in the arrays
678 * rval and jval for the blocks that couple this processor to other
679 * processes in proc_list
680 * 6) offset(nprocs): the offset array for the distributed right hand side
681 * vector
682 * 7) nsize(nprocs): number of vector elements per processor for right hand
683 * side vector (equivalent to the number of rows per processor)
684 *
685 * These arrays describe how the sparse matrix is layed out both locally and
686 * across processors. In addition, the actual data for the distributed sparse
687 * matrix is found in the following arrays:
688 * 1) rval: values of matrix for all blocks on this processor
689 * 2) jval: j-indices of matrix for all blocks on this processor
690 * 3) ival(total_procs*(nsize(me)+1)): starting index in rval and jval for
691 * each row in each block
692 *
693 * create global arrays to hold sparse matrix
694 */
695 tmapc = (int*)malloc(nprocs*sizeof(int));
696 limapc = (int*)malloc(nprocs*sizeof(int));
697 for (i=0; i<nprocs; i++) {
698 tmapc[i] = 0;
699 }
700 for (i=0; i<ltotal_procs; i++) {
701 tmapc[me] = tmapc[me] + licnt[i];
702 }
703 GA_Igop(tmapc,nprocs,"+");
704 isize = 0;
705 for (i=0; i<nprocs; i++) {
706 isize += tmapc[i];
707 }
708 limapc[0] = 0;
709 for (i=1; i<nprocs; i++) {
710 limapc[i] = limapc[i-1] + tmapc[i-1];
711 }
712
713 *g_data = NGA_Create_handle();
714 NGA_Set_data(*g_data,one,&isize,C_DBL);
715 NGA_Set_irreg_distr(*g_data,limapc,&nprocs);
716 status = NGA_Allocate(*g_data);
717
718 *g_j = NGA_Create_handle();
719 NGA_Set_data(*g_j,one,&isize,C_INT);
720 NGA_Set_irreg_distr(*g_j,limapc,&nprocs);
721 status = NGA_Allocate(*g_j);
722
723 for (i=0; i<nprocs; i++) {
724 tmapc[i] = 0;
725 }
726 tmapc[me] = tmapc[me] + (lnsize[me]+1)*ltotal_procs;
727 GA_Igop(tmapc,nprocs,"+");
728 ntot = 0;
729 for (i=0; i<nprocs; i++) {
730 ntot += tmapc[i];
731 }
732 limapc[0] = 0;
733 for (i=1; i<nprocs; i++) {
734 limapc[i] = limapc[i-1] + tmapc[i-1];
735 }
736
737 *g_i = NGA_Create_handle();;
738 NGA_Set_data(*g_i,one,&ntot,C_INT);
739 NGA_Set_irreg_distr(*g_i,limapc,&nprocs);
740 status = NGA_Allocate(*g_i);
741
742 free(tmapc);
743
744 GA_Sync();
745 /**
746 * fill global arrays with local data
747 */
748 NGA_Distribution(*g_data,me,&tlo,&thi);
749 ld = thi-tlo+1;
750 NGA_Put(*g_data, &tlo, &thi, rval, &ld);
751 NGA_Put(*g_j, &tlo, &thi, jval, &ld);
752
753 NGA_Distribution(*g_i,me,&tlo,&thi);
754 ld = thi-tlo+1;
755 NGA_Put(*g_i, &tlo, &thi, ival, &ld);
756
757 GA_Sync();
758
759 free(rval);
760 free(jval);
761 free(ival);
762 free(xld);
763 free(yld);
764 free(zld);
765
766 limapc[0] = 0;
767 for (i=1; i<nprocs; i++) {
768 limapc[i] = limapc[i-1] + lnsize[i-1];
769 }
770 *imapc = limapc;
771 *tsize = isize;
772 return;
773 }
774
main(int argc,char ** argv)775 int main(int argc, char **argv) {
776 int nmax, nprocs, me;
777 int g_a_data, g_a_i, g_a_j, isize;
778 int g_b, g_c;
779 int i, j, jj, k, one, jcnt;
780 int chunk, kp1, ld;
781 int *p_i, *p_j;
782 double *p_data, *p_b, *p_c;
783 double t_beg, t_beg2, t_ga_tot, t_get, t_mult, t_cnstrct, t_mpi_in, t_ga_in;
784 double t_hypre_strct;
785 double prdot, dotga, dothypre, tempc;
786 double prtot, gatot, hypretot;
787 int status;
788 int idim, jdim, kdim, idum, memsize;
789 int jmin, jmax, lsize, ntot;
790 int heap=200000, fudge=100, stack=200000, ma_heap;
791 double *cbuf, *vector;
792 int pdi, pdj, pdk, ip, jp, kp, ncells;
793 int lo[3],hi[3];
794 int blo[3], bhi[3];
795 int ld_a, ld_b, ld_c, ld_i, ld_j, irows, ioff, joff, total_procs;
796 int iproc, iblock, btot;
797 double *amat, *bvec;
798 int *ivec, *jvec;
799 int *proclist, *proc_inv, *icnt;
800 int *voffset, *nsize, *offset, *mapc;
801 int iloop;
802 int LOOPNUM = 100;
803 /**
804 * Hypre declarations. These are only used if linking to Hypre library for
805 * performance testing
806 */
807 int ierr;
808 #if USE_HYPRE
809 HYPRE_StructGrid grid;
810 HYPRE_StructStencil stencil;
811 HYPRE_StructMatrix matrix;
812 HYPRE_StructVector vec_x, vec_y;
813 int i4, j4, ndim, nelems, offsets[7][3];
814 int stencil_indices[7], hlo[3], hhi[3];
815 double weights[7];
816 double *values;
817 double alpha, beta;
818 int *rows, *cols;
819 #endif
820 /**
821 * Intitialize a message passing library
822 */
823 one = 1;
824 MP_INIT(argc,argv);
825 /**
826 * Initialize GA
827 *
828 * There are 2 choices: ga_initialize or ga_initialize_ltd.
829 * In the first case, there is no explicit limit on memory usage.
830 * In the second, user can set limit (per processor) in bytes.
831 */
832 t_beg = GA_Wtime();
833 GA_Initialize();
834 t_ga_in = GA_Wtime() - t_beg;
835 GA_Dgop(&t_ga_in,one,"+");
836 #if 0
837 memsize = 500000000
838 call ga_initialize_ltd(memsize)
839 memsize = 500000
840 status = ma_init(MT_DBL,memsize,memsize)
841 #endif
842
843 t_ga_tot = 0.0;
844 t_mult = 0.0;
845 t_get = 0.0;
846 t_hypre_strct = 0.0;
847 prtot = 0.0;
848 gatot = 0.0;
849 hypretot = 0.0;
850
851 me = GA_Nodeid();
852 nprocs = GA_Nnodes();
853 if (me == 0) {
854 printf("Time to initialize GA: %12.4f\n",
855 t_ga_in/((double)nprocs));
856 }
857 /**
858 * we can also use GA_set_memory_limit BEFORE first ga_create call
859 */
860 ma_heap = heap + fudge;
861 /* call GA_set_memory_limit(util_mdtob(ma_heap)) */
862
863 if (me == 0) {
864 printf("\nNumber of cores used: %d\n\nGA initialized\n\n",nprocs);
865 }
866 /**
867 * Initialize the MA package
868 * MA must be initialized before any global array is allocated
869 */
870 if (!MA_init(MT_DBL, stack, ma_heap)) GA_Error("ma_init failed",-1);
871 /**
872 * create a sparse LMAX x LMAX matrix and two vectors of length
873 * LMAX. The matrix is stored in compressed row format.
874 * One of the vectors is filled with random data and the other
875 * is filled with zeros.
876 */
877 idim = IMAX;
878 jdim = JMAX;
879 kdim = KMAX;
880 ntot = idim*jdim*kdim;
881 if (me == 0) {
882 printf("\nDimension of matrix: %d\n\n",ntot);
883 }
884 t_beg = GA_Wtime();
885 grid_factor(nprocs,idim,jdim,kdim,&pdi,&pdj,&pdk);
886 if (me == 0) {
887 printf("\nProcessor grid configuration\n");
888 printf(" PDX: %d\n",pdi);
889 printf(" PDY: %d\n",pdj);
890 printf(" PDZ: %d\n\n",pdk);
891 }
892
893 create_laplace_mat(idim,jdim,kdim,pdi,pdj,pdk,
894 &g_a_data,&g_a_i,&g_a_j,&total_procs,&proclist,&proc_inv,
895 &icnt,&voffset,&nsize,&offset,&isize,&mapc);
896 t_cnstrct = GA_Wtime() - t_beg;
897 if (me == 0) {
898 printf("\nNumber of non-zero elements in compressed matrix: %d\n",isize);
899 }
900
901 g_b = GA_Create_handle();
902 GA_Set_data(g_b,one,&ntot,C_DBL);
903 GA_Set_irreg_distr(g_b,mapc,&nprocs);
904 status = GA_Allocate(g_b);
905 /**
906 * fill g_b with random values
907 */
908 NGA_Distribution(g_b,me,blo,bhi);
909 NGA_Access(g_b,blo,bhi,&p_b,&ld);
910 ld = bhi[0]-blo[0]+1;
911 btot = ld;
912 vector = (double*)malloc(ld*sizeof(double));
913 for (i=0; i<ld; i++) {
914 idum = 0;
915 p_b[i] = ran3(&idum);
916 vector[i] = p_b[i];
917 }
918 NGA_Release(g_b,blo,bhi);
919 GA_Sync();
920
921 g_c = GA_Create_handle();
922 NGA_Set_data(g_c,one,&ntot,C_DBL);
923 NGA_Set_irreg_distr(g_c,mapc,&nprocs);
924 status = GA_Allocate(g_c);
925 NGA_Zero(g_c);
926 #if USE_HYPRE
927 /**
928 * Assemble HYPRE grid and use that to create matrix. Start by creating
929 * grid partition
930 */
931 ndim = 3;
932 i = me;
933 ip = i%pdi;
934 i = (i-ip)/pdi;
935 jp = i%pdj;
936 kp = (i-jp)/pdj;
937 lo[0] = (int)(((double)idim)*((double)ip)/((double)pdi));
938 if (ip < pdi-1) {
939 hi[0] = (int)(((double)idim)*((double)(ip+1))/((double)pdi)) - 1;
940 } else {
941 hi[0] = idim - 1;
942 }
943 lo[1] = (int)(((double)jdim)*((double)jp)/((double)pdj));
944 if (jp < pdj-1) {
945 hi[1] = (int)(((double)jdim)*((double)(jp+1))/((double)pdj)) - 1;
946 } else {
947 hi[1] = jdim - 1;
948 }
949 lo[2] = (int)(((double)kdim)*((double)kp)/((double)pdk));
950 if (kp < pdk-1) {
951 hi[2] = (int)(((double)kdim)*((double)(kp+1))/((double)pdk)) - 1;
952 } else {
953 hi[2] = kdim - 1;
954 }
955 /**
956 * Create grid
957 */
958 hlo[0] = lo[0];
959 hlo[1] = lo[1];
960 hlo[2] = lo[2];
961 hhi[0] = hi[0];
962 hhi[1] = hi[1];
963 hhi[2] = hi[2];
964 ierr = HYPRE_StructGridCreate(MPI_COMM_WORLD, ndim, &grid);
965 ierr = HYPRE_StructGridSetExtents(grid, hlo, hhi);
966 ierr = HYPRE_StructGridAssemble(grid);
967 /**
968 * Create stencil
969 */
970 offsets[0][0] = 0;
971 offsets[0][1] = 0;
972 offsets[0][2] = 0;
973
974 offsets[1][0] = 1;
975 offsets[1][1] = 0;
976 offsets[1][2] = 0;
977
978 offsets[2][0] = 0;
979 offsets[2][1] = 1;
980 offsets[2][2] = 0;
981
982 offsets[3][0] = 0;
983 offsets[3][1] = 0;
984 offsets[3][2] = 1;
985
986 offsets[4][0] = -1;
987 offsets[4][1] = 0;
988 offsets[4][2] = 0;
989
990 offsets[5][0] = 0;
991 offsets[5][1] = -1;
992 offsets[5][2] = 0;
993
994 offsets[6][0] = 0;
995 offsets[6][1] = 0;
996 offsets[6][2] = -1;
997
998 nelems = 7;
999 ierr = HYPRE_StructStencilCreate(ndim, nelems, &stencil);
1000 for (i=0; i<nelems; i++) {
1001 ierr = HYPRE_StructStencilSetElement(stencil, i, offsets[i]);
1002 }
1003
1004 ncells = (hi[0]-lo[0]+1)*(hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
1005 jcnt = 7*ncells;
1006 values = (double*)malloc(jcnt*sizeof(double));
1007 jcnt = 0;
1008 weights[0] = 6.0;
1009 weights[1] = -1.0;
1010 weights[2] = -1.0;
1011 weights[3] = -1.0;
1012 weights[4] = -1.0;
1013 weights[5] = -1.0;
1014 weights[6] = -1.0;
1015 for (i=0; i<ncells; i++) {
1016 for (j=0; j<7; j++) {
1017 values[jcnt] = weights[j];
1018 jcnt++;
1019 }
1020 }
1021
1022 ierr = HYPRE_StructMatrixCreate(MPI_COMM_WORLD, grid, stencil, &matrix);
1023 ierr = HYPRE_StructMatrixInitialize(matrix);
1024 for (i=0; i<7; i++) {
1025 stencil_indices[i] = i;
1026 }
1027 ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, 7, stencil_indices, values);
1028 free(values);
1029 /**
1030 * Check all six sides of current box to see if any are boundaries.
1031 * Set values to zero if they are.
1032 */
1033 if (hi[0] == idim-1) {
1034 ncells = (hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
1035 hlo[0] = idim-1;
1036 hhi[0] = idim-1;
1037 hlo[1] = lo[1];
1038 hhi[1] = hi[1];
1039 hlo[2] = lo[2];
1040 hhi[2] = hi[2];
1041 values = (double*)malloc(ncells*sizeof(double));
1042 for (i=0; i<ncells; i++) values[i] = 0.0;
1043 i4 = 1;
1044 j4 = 1;
1045 ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
1046 free(values);
1047 }
1048 if (hi[1] == jdim-1) {
1049 ncells = (hi[0]-lo[0]+1)*(hi[2]-lo[2]+1);
1050 hlo[0] = lo[0];
1051 hhi[0] = hi[0];
1052 hlo[1] = jdim-1;
1053 hhi[1] = jdim-1;
1054 hlo[2] = lo[2];
1055 hhi[2] = hi[2];
1056 values = (double*)malloc(ncells*sizeof(double));
1057 for (i=0; i<ncells; i++) values[i] = 0.0;
1058 i4 = 1;
1059 j4 = 2;
1060 ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
1061 free(values);
1062 }
1063 if (hi[2] == kdim-1) {
1064 ncells = (hi[0]-lo[0]+1)*(hi[1]-lo[1]+1);
1065 hlo[0] = lo[0];
1066 hhi[0] = hi[0];
1067 hlo[1] = lo[1];
1068 hhi[1] = hi[1];
1069 hlo[2] = kdim-1;
1070 hhi[2] = kdim-1;
1071 values = (double*)malloc(ncells*sizeof(double));
1072 for (i=0; i<ncells; i++) values[i] = 0.0;
1073 i4 = 1;
1074 j4 = 3;
1075 ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
1076 free(values);
1077 }
1078 if (lo[0] == 0) {
1079 ncells = (hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
1080 hlo[0] = 0;
1081 hhi[0] = 0;
1082 hlo[1] = lo[1];
1083 hhi[1] = hi[1];
1084 hlo[2] = lo[2];
1085 hhi[2] = hi[2];
1086 values = (double*)malloc(ncells*sizeof(double));
1087 for (i=0; i<ncells; i++) values[i] = 0.0;
1088 i4 = 1;
1089 j4 = 4;
1090 ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
1091 free(values);
1092 }
1093 if (lo[1] == 0) {
1094 ncells = (hi[0]-lo[0]+1)*(hi[2]-lo[2]+1);
1095 hlo[0] = lo[0];
1096 hhi[0] = hi[0];
1097 hlo[1] = 0;
1098 hhi[1] = 0;
1099 hlo[2] = lo[2];
1100 hhi[2] = hi[2];
1101 values = (double*)malloc(ncells*sizeof(double));
1102 for (i=0; i<ncells; i++) values[i] = 0.0;
1103 i4 = 1;
1104 j4 = 5;
1105 ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
1106 free(values);
1107 }
1108 if (lo[2] == 1) {
1109 ncells = (hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
1110 hlo[0] = lo[0];
1111 hhi[0] = hi[0];
1112 hlo[1] = lo[1];
1113 hhi[1] = hi[1];
1114 hlo[2] = 0;
1115 hhi[2] = 0;
1116 values = (double*)malloc(ncells*sizeof(double));
1117 for (i=0; i<ncells; i++) values[i] = 0.0;
1118 i4 = 1;
1119 j4 = 6;
1120 ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
1121 free(values);
1122 }
1123 ierr = HYPRE_StructMatrixAssemble(matrix);
1124 /**
1125 * Create vectors for matrix-vector multiply
1126 */
1127 ierr = HYPRE_StructVectorCreate(MPI_COMM_WORLD, grid, &vec_x);
1128 ierr = HYPRE_StructVectorInitialize(vec_x);
1129 hlo[0] = lo[0];
1130 hlo[1] = lo[1];
1131 hlo[2] = lo[2];
1132 hhi[0] = hi[0];
1133 hhi[1] = hi[1];
1134 hhi[2] = hi[2];
1135 ierr = HYPRE_StructVectorSetBoxValues(vec_x, hlo, hhi, vector);
1136 ierr = HYPRE_StructVectorAssemble(vec_x);
1137 NGA_Distribution(g_a_i,me,blo,bhi);
1138
1139 if (bhi[1] > ntot-1) {
1140 bhi[1] = ntot-1;
1141 }
1142
1143 btot = (hi[0]-lo[0]+1)*(hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
1144
1145 for (i=0; i<btot; i++) vector[i] = 0.0;
1146 hlo[0] = lo[0];
1147 hlo[1] = lo[1];
1148 hlo[2] = lo[2];
1149 hhi[0] = hi[0];
1150 hhi[1] = hi[1];
1151 hhi[2] = hi[2];
1152 ierr = HYPRE_StructVectorGetBoxValues(vec_x, hlo, hhi, vector);
1153
1154 for (i=0; i<btot; i++) vector[i] = 0.0;
1155 ierr = HYPRE_StructVectorCreate(MPI_COMM_WORLD, grid, &vec_y);
1156 ierr = HYPRE_StructVectorInitialize(vec_y);
1157 ierr = HYPRE_StructVectorSetBoxValues(vec_y, hlo, hhi, vector);
1158 ierr = HYPRE_StructVectorAssemble(vec_y);
1159 #endif
1160 /**
1161 * Multiply sparse matrix. Start by accessing pointers to local portions of
1162 * g_a_data, g_a_j, g_a_i
1163 */
1164 for (iloop=0; iloop<LOOPNUM; iloop++) {
1165 t_beg2 = GA_Wtime();
1166 NGA_Distribution(g_a_data,me,blo,bhi);
1167 NGA_Access(g_a_data,blo,bhi,&p_data,&ld_a);
1168
1169 NGA_Distribution(g_a_j,me,blo,bhi);
1170 NGA_Access(g_a_j,blo,bhi,&p_j,&ld_j);
1171
1172 NGA_Distribution(g_a_i,me,blo,bhi);
1173 NGA_Access(g_a_i,blo,bhi,&p_i,&ld_i);
1174
1175 NGA_Distribution(g_c,me,blo,bhi);
1176 NGA_Access(g_c,blo,bhi,&p_c,&ld_c);
1177 for (i = 0; i<bhi[0]-blo[0]+1; i++) {
1178 p_c[i] = 0.0;
1179 }
1180 /**
1181 * Loop through matrix blocks
1182 */
1183 ioff = 0;
1184 for (iblock = 0; iblock<total_procs; iblock++) {
1185 iproc = proclist[iblock];
1186 NGA_Distribution(g_b,iproc,blo,bhi);
1187 bvec = (double*)malloc((bhi[0]-blo[0]+1)*sizeof(double));
1188 j = 0;
1189 t_beg = GA_Wtime();
1190 NGA_Get(g_b,blo,bhi,bvec,&j);
1191 t_get = t_get + GA_Wtime() - t_beg;
1192 t_beg = GA_Wtime();
1193 irows = nsize[me]-1;
1194 for (i=0; i<=irows; i++) {
1195 jmin = p_i[ioff+i];
1196 jmax = p_i[ioff+i+1]-1;
1197 tempc = 0.0;
1198 for (j = jmin; j<=jmax; j++) {
1199 jj = p_j[j];
1200 tempc = tempc + p_data[j]*bvec[jj];
1201 }
1202 p_c[i] = p_c[i] + tempc;
1203 }
1204 ioff = ioff + nsize[me] + 1;
1205 t_mult = t_mult + GA_Wtime() - t_beg;
1206 free(bvec);
1207 }
1208 t_ga_tot = t_ga_tot + GA_Wtime() - t_beg2;
1209
1210 NGA_Distribution(g_a_data,me,blo,bhi);
1211 NGA_Release(g_a_data,blo,bhi);
1212 NGA_Distribution(g_a_j,me,blo,bhi);
1213 NGA_Release(g_a_j,blo,bhi);
1214 NGA_Distribution(g_a_i,me,blo,bhi);
1215 NGA_Release(g_a_i,blo,bhi);
1216 NGA_Distribution(g_c,me,blo,bhi);
1217 NGA_Release(g_c,blo,bhi);
1218
1219 #if USE_HYPRE
1220 alpha = 1.0;
1221 beta = 0.0;
1222 t_beg = GA_Wtime();
1223 ierr = HYPRE_StructMatrixMatvec(alpha, matrix, vec_x, beta, vec_y);
1224 t_hypre_strct = GA_Wtime() - t_beg;
1225 hlo[0] = lo[0];
1226 hlo[1] = lo[1];
1227 hlo[2] = lo[2];
1228 hhi[0] = hi[0];
1229 hhi[1] = hi[1];
1230 hhi[2] = hi[2];
1231 ierr = HYPRE_StructVectorGetBoxValues(vec_y, hlo, hhi, vector);
1232 NGA_Distribution(g_c,me,lo,hi);
1233 cbuf = (double*)malloc((hi[0]-lo[0]+1)*sizeof(double));
1234 NGA_Get(g_c,lo,hi,cbuf,&one);
1235 prdot = 0.0;
1236 dotga = 0.0;
1237 dothypre = 0.0;
1238 for (i=0; i<(hi[0]-lo[0]+1); i++) {
1239 dothypre = dothypre + vector[i]*vector[i];
1240 dotga = dotga + cbuf[i]*cbuf[i];
1241 prdot = prdot + (vector[i]-cbuf[i])*(vector[i]-cbuf[i]);
1242 }
1243 NGA_Dgop(&dotga,1,"+");
1244 NGA_Dgop(&dothypre,1,"+");
1245 NGA_Dgop(&prdot,1,"+");
1246 gatot += sqrt(dotga);
1247 hypretot += sqrt(dothypre);
1248 prtot += sqrt(prdot);
1249 free(cbuf);
1250 #endif
1251 }
1252 #if USE_HYPRE
1253 if (me == 0) {
1254 printf("Magnitude of GA solution: %e\n",
1255 gatot/((double)LOOPNUM));
1256 printf("Magnitude of HYPRE solution: %e\n",
1257 hyprtot/((double)LOOPNUM));
1258 printf("Difference between GA and HYPRE (Struct) results: %e\n",
1259 prtot/((double)LOOPNUM));
1260 }
1261 #endif
1262
1263 free(vector);
1264 /**
1265 * Clean up arrays
1266 */
1267 NGA_Destroy(g_b);
1268 NGA_Destroy(g_c);
1269 NGA_Destroy(g_a_data);
1270 NGA_Destroy(g_a_i);
1271 NGA_Destroy(g_a_j);
1272 #if USE_HYPRE
1273 #if USE_STRUCT
1274 ierr = HYPRE_StructStencilDestroy(stencil);
1275 ierr = HYPRE_StructGridDestroy(grid);
1276 ierr = HYPRE_StructMatrixDestroy(matrix);
1277 ierr = HYPRE_StructVectorDestroy(vec_x);
1278 ierr = HYPRE_StructVectorDestroy(vec_y);
1279 #endif
1280 #endif
1281
1282 NGA_Dgop(&t_cnstrct,1,"+");
1283 NGA_Dgop(&t_get,1,"+");
1284 NGA_Dgop(&t_mult,1,"+");
1285 NGA_Dgop(&t_ga_tot,1,"+");
1286 #if USE_HYPRE
1287 NGA_Dgop(&t_hypre_strct,1,"+");
1288 #endif
1289 free(proclist);
1290 free(proc_inv);
1291 free(icnt);
1292 free(voffset);
1293 free(nsize);
1294 free(offset);
1295 free(mapc);
1296
1297 if (me == 0) {
1298 printf("Time to create sparse matrix: %12.4f\n",
1299 t_cnstrct/((double)nprocs));
1300 printf("Time to get right hand side vector: %12.4f\n",
1301 t_get/((double)nprocs));
1302 printf("Time for sparse matrix block multiplication: %12.4f\n",
1303 t_mult/((double)nprocs));
1304 printf("Time for total sparse matrix multiplication: %12.4f\n",
1305 t_ga_tot/((double)nprocs));
1306 #if USE_HYPRE
1307 #if USE_STRUCT
1308 printf("Total time for HYPRE (Struct) matrix-vector multiply:%12.4f\n",
1309 t_hypre_strct/((double)nprocs));
1310 #endif
1311 #endif
1312 }
1313 NGA_Terminate();
1314 /**
1315 * Tidy up after message-passing library
1316 */
1317 MP_FINALIZE();
1318 }
1319