1 #include <stdlib.h>
2 #include <math.h>
3 #include <stdio.h>
4 
5 #include "macdecls.h"
6 #include "ga.h"
7 #include "mp3.h"
8 
9 #define USE_HYPRE 0
10 #define IMAX 200
11 #define JMAX 200
12 #define KMAX 200
13 #define LMAX IMAX*JMAX*KMAX
14 
15 #define bb_a(ib) bb_v(bb_i + (ib))
16 #define cc_a(ib) cc_v(cc_i + (ib))
17 
18 #define Integer int
19 #define MAX_FACTOR 10000
20 
21 /**
22  * If this test is built standalone and then linked to the Hypre library,
23  * the Hypre sparse matrix-vector multiply routines can be used to check the
24  * correctness of the answers
25  */
26 #if USE_HYPRE
27 #include "HYPRE.h"
28 #include "HYPRE_struct_mv.h"
29 #include "mpi.h"
30 #endif
31 
grid_factor(int p,int xdim,int ydim,int zdim,int * idx,int * idy,int * idz)32 void grid_factor(int p, int xdim, int ydim, int zdim, int *idx, int *idy, int *idz) {
33   int i, j;
34   int ip, ifac, pmax, prime[MAX_FACTOR];
35   int fac[MAX_FACTOR];
36   int ix, iy, iz, ichk;
37 
38   i = 1;
39 /**
40  *   factor p completely
41  *   first, find all prime numbers, besides 1, less than or equal to
42  *   the square root of p
43  */
44   ip = (int)(sqrt((double)p))+1;
45   pmax = 0;
46   for (i=2; i<=ip; i++) {
47     ichk = 1;
48     for (j=0; j<pmax; j++) {
49       if (i%prime[j] == 0) {
50         ichk = 0;
51         break;
52       }
53     }
54     if (ichk) {
55       pmax = pmax + 1;
56       if (pmax > MAX_FACTOR) printf("Overflow in grid_factor\n");
57       prime[pmax-1] = i;
58     }
59   }
60 /**
61  *   find all prime factors of p
62  */
63   ip = p;
64   ifac = 0;
65   for (i=0; i<pmax; i++) {
66     while(ip%prime[i] == 0) {
67       ifac = ifac + 1;
68       fac[ifac-1] = prime[i];
69       ip = ip/prime[i];
70     }
71   }
72 /**
73  *  p is prime
74  */
75   if (ifac==0) {
76     ifac++;
77     fac[0] = p;
78   }
79 /**
80  *    find three factors of p of approximately the
81  *    same size
82  */
83   *idx = 1;
84   *idy = 1;
85   *idz = 1;
86   for (i = ifac-1; i >= 0; i--) {
87     ix = xdim/(*idx);
88     iy = ydim/(*idy);
89     iz = zdim/(*idz);
90     if (ix >= iy && ix >= iz && ix > 1) {
91       *idx = fac[i]*(*idx);
92     } else if (iy >= ix && iy >= iz && iy > 1) {
93       *idy = fac[i]*(*idy);
94     } else if (iz >= ix && iz >= iy && iz > 1) {
95       *idz = fac[i]*(*idz);
96     } else {
97       printf("Too many processors in grid factoring routine\n");
98     }
99   }
100 }
101 
102 /**
103  *  Short subroutine for multiplying sparse matrix block with vector segment
104  */
loc_matmul(double * a_mat,int * jvec,int * ivec,double * bvec,double * cvec,int nrows)105 void loc_matmul(double *a_mat, int *jvec, int *ivec,
106                 double *bvec, double *cvec, int nrows) {
107   double tempc;
108   int i, j, jj, jmin,jmax;
109   for (i=0; i<nrows; i++) {
110     jmin = ivec[i];
111     jmax = ivec[i+1]-1;
112     tempc = 0.0;
113     for (j=jmin; j<jmax; j++) {
114       jj = jvec[j];
115       tempc = tempc + a_mat[j]*bvec[jj];
116     }
117     cvec[i] = cvec[i] + tempc;
118   }
119 }
120 /**
121  *   Random number generator
122  */
ran3(int * idum)123 double ran3(int *idum) {
124   static int iff = 0;
125   double randnum;
126   if (idum < 0 || iff == 0) {
127     iff = 1;
128     srand((unsigned int)abs(*idum));
129     *idum = 1;
130   }
131   randnum = ((double)rand())/((double)RAND_MAX);
132   return randnum;
133 }
134 
135 /**
136  *  create a sparse matrix in compressed row form corresponding to a Laplacian
137  *  differential operator using a 7-point stencil for a grid on a lattice of
138  *  dimension idim x jdim x kdim grid points
139  */
create_laplace_mat(int idim,int jdim,int kdim,int pdi,int pdj,int pdk,int * g_data,int * g_i,int * g_j,int * total_procs,int ** proclist,int ** proc_inv,int ** icnt,int ** voffset,int ** nsize,int ** offset,int * tsize,int ** imapc)140 void create_laplace_mat(int idim, int jdim, int kdim, int pdi, int pdj, int pdk,
141                         int *g_data, int *g_i, int *g_j, int *total_procs,
142                         int **proclist, int **proc_inv, int **icnt, int **voffset,
143                         int **nsize, int **offset, int *tsize, int **imapc)
144 {
145 /**
146  *  idim: i-dimension of grid
147  *  jdim: j-dimension of grid
148  *  kdim: k-dimension of grid
149  *  pdi: i-dimension of processor grid
150  *  pdj: j-dimension of processor grid
151  *  pdk: k-dimension of processor grid
152  *  g_data: global array of values
153  *  g_j: global array containing j indices (using local indices)
154  *  g_i: global array containing starting location of each row in g_j
155  *       (using local indices)
156  *  total_procs: number of processors this proc interacts with
157  *  proclist: list of processors that this process interacts with
158  *  proc_inv: given a processor, map it to position in proclist
159  *  icnt: number of elements in submatrix
160  *  voffset: offset for each submatrix block
161  *  nsize: number of elements in right hand side vector on each processor
162  *  offset: starting index of right hand side vector on each processor
163  *  tsize: total number of non-zero elements in matrix
164  *  imapc: map array for vectors
165  */
166   int ltotal_procs, ltsize;
167   int *lproclist, *lproc_inv,  *lvoffset, *lnsize, *loffset, *licnt, *limapc;
168   int nprocs, me, imin, imax, jcnt, jmin, jmax;
169   int ix, iy, iz, idx;
170   double x, dr;
171   double *rval;
172   int isize, idbg;
173   int *jval,  *ival,  *ivalt;
174   int i, j, k, itmp, one, tlo, thi, ld;
175   int idum, ntot, indx, nghbrs[7], ncnt;
176   int ixn[7],iyn[7],izn[7], procid[7];
177   int status;
178   int lo[3], hi[3], ip, jp, kp, ldi, ldj, jdx, joff;
179   int il, jl, kl, ldmi, ldpi, ldmj, ldpj;
180   int *xld, *yld, *zld, *tmapc;
181   int *ecnt, *total_distr;
182   int total_max;
183   FILE *fp, *fopen();
184 
185   me = GA_Nodeid();
186   nprocs = GA_Nnodes();
187   idum = -(12345+me);
188   x = ran3(&idum);
189   one = 1;
190 
191   if (me == 0) {
192     printf("\n Dimension of grid: \n\n");
193     printf(" I Dimension: %d\n",idim);
194     printf(" J Dimension: %d\n",jdim);
195     printf(" K Dimension: %d\n\n",kdim);
196   }
197 /**
198  * Find position of processor in processor grid and calulate minimum
199  * and maximum values of indices
200  */
201   i = me;
202   ip = i%pdi;
203   i = (i-ip)/pdi;
204   jp = i%pdj;
205   kp = (i-jp)/pdj;
206 
207   lo[0] = (int)((((double)idim)*((double)ip))/((double)pdi));
208   if (ip < pdi-1) {
209     hi[0] = (int)((((double)idim)*((double)(ip+1)))/((double)pdi))-1;
210   } else {
211     hi[0] = idim - 1;
212   }
213 
214   lo[1] = (int)((((double)jdim)*((double)jp))/((double)pdj));
215   if (jp < pdj-1) {
216     hi[1] = (int)((((double)jdim)*((double)(jp+1)))/((double)pdj))-1;
217   } else {
218     hi[1] = jdim - 1;
219   }
220 
221   lo[2] = (int)((((double)kdim)*((double)kp))/((double)pdk));
222   if (kp < pdk-1) {
223     hi[2] = (int)((((double)kdim)*((double)(kp+1)))/((double)pdk))-1;
224   } else {
225     hi[2] = kdim - 1;
226   }
227 
228 /**
229  * Determine stride lengths. Start with stride lengths for grid block
230  * owned by this processor
231  */
232   ldi = hi[0]-lo[0]+1;
233   ldj = hi[1]-lo[1]+1;
234 
235 /**
236  * Find stride lengths for blocks owned by other processors
237  */
238   xld = (int*)malloc(pdi*sizeof(int));
239   for (i=0; i<pdi; i++) {
240     if (i<pdi-1) {
241       xld[i] = (int)((((double)idim)*((double)(i+1)))/((double)pdi));
242     } else {
243       xld[i] = idim;
244     }
245     xld[i] = xld[i] - (int)((((double)idim)*((double)(i)))/((double)pdi));
246   }
247 
248   yld = (int*)malloc(pdj*sizeof(int));
249   for (i=0; i<pdj; i++) {
250     if (i<pdj-1) {
251       yld[i] = (int)((((double)jdim)*((double)(i+1)))/((double)pdj));
252     } else {
253       yld[i] = jdim;
254     }
255     yld[i] = yld[i] - (int)((((double)jdim)*((double)(i)))/((double)pdj));
256   }
257 
258   zld = (int*)malloc(pdk*sizeof(int));
259   for (i=0; i<pdk; i++) {
260     if (i<pdk-1) {
261       zld[i] = (int)((((double)kdim)*((double)(i+1)))/((double)pdk));
262     } else {
263       zld[i] = jdim;
264     }
265     zld[i] = zld[i] - (int)((((double)kdim)*((double)(i)))/((double)pdk));
266   }
267 
268 /**
269  *   Determine number of rows per processor
270  */
271 
272   lnsize = (int*)malloc(nprocs*sizeof(int));
273   *nsize = lnsize;
274   loffset = (int*)malloc(nprocs*sizeof(int));
275   *offset = loffset;
276   for (i=0; i<nprocs; i++) {
277     lnsize[i] = 0;
278     loffset[i] = 0;
279   }
280   lnsize[me] = (hi[0]-lo[0]+1)*(hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
281   GA_Igop(lnsize,nprocs,"+");
282   loffset[0] = 0;
283   for (i=1; i<nprocs; i++) {
284     loffset[i] = loffset[i-1] + lnsize[i-1];
285   }
286 
287   ntot = idim*jdim*kdim;
288   NGA_Sync();
289 /**
290  *   Scan over rows of lattice
291  */
292   imin = loffset[me];
293   imax = loffset[me]+lnsize[me]-1;
294 /**
295  *   Find out how many other processors couple to this row of blocks. Start by
296  *   initializing ecnt array to zero
297  */
298   ecnt = (int*)malloc(nprocs*sizeof(int));
299   for (i=0; i<nprocs; i++) {
300     ecnt[i] = 0;
301   }
302 
303 /**
304  * Loop over all rows owned by this processor
305  */
306   for (i=imin; i<=imax; i++) {
307 /**
308  *   Compute local indices of grid point corresponding to row i (this
309  *   corresponds to an element of the grid)
310  */
311     indx = i - imin;
312     ix = indx%ldi;
313     indx = (indx - ix)/ldi;
314     iy = indx%ldj;
315     iz = (indx - iy)/ldj;
316     ix = ix + lo[0];
317     iy = iy + lo[1];
318     iz = iz + lo[2];
319 
320 /**
321  * Check all neighbors. Mark the ecnt element corresponding to the processor
322  * that owns the element
323  */
324     ecnt[me] = ecnt[me] + 1;
325     if (ix+1 <= idim-1) {
326       if (ix+1 > hi[0]) {
327         jdx = kp*pdi*pdj + jp*pdi + ip + 1;
328         ecnt[jdx] = ecnt[jdx] + 1;
329       } else {
330         ecnt[me] = ecnt[me] + 1;
331       }
332     }
333     if (ix-1 >= 0) {
334       if (ix-1 < lo[0]) {
335         jdx = kp*pdi*pdj + jp*pdi + ip - 1;
336         ecnt[jdx] = ecnt[jdx] + 1;
337       } else {
338         ecnt[me] = ecnt[me] + 1;
339       }
340     }
341     if (iy+1 <= jdim-1) {
342       if (iy+1 > hi[1]) {
343         jdx = kp*pdi*pdj + (jp+1)*pdi + ip;
344         ecnt[jdx] = ecnt[jdx] + 1;
345       } else {
346         ecnt[me] = ecnt[me] + 1;
347       }
348     }
349     if (iy-1 >= 0) {
350       if (iy-1 < lo[1]) {
351         jdx = kp*pdi*pdj + (jp-1)*pdi + ip;
352         ecnt[jdx] = ecnt[jdx] + 1;
353       } else {
354         ecnt[me] = ecnt[me] + 1;
355       }
356     }
357     if (iz+1 <= kdim-1) {
358       if (iz+1 > hi[2]) {
359         jdx = (kp+1)*pdi*pdj + jp*pdi + ip;
360         ecnt[jdx] = ecnt[jdx] + 1;
361       } else {
362         ecnt[me] = ecnt[me] + 1;
363       }
364     }
365     if (iz-1 >= 0) {
366       if (iz-1 < lo[2]) {
367         jdx = (kp-1)*pdi*pdj + jp*pdi + ip;
368         ecnt[jdx] = ecnt[jdx] + 1;
369       } else {
370         ecnt[me] = ecnt[me] + 1;
371       }
372     }
373   }
374 
375 /**
376  *  Create a list of processors that this processor is coupled to. Also count
377  *  the total number of grid points that this processor couples to. This number
378  *  is equal to the total number of matrix elements held by this processor.
379  */
380   ltotal_procs = 0;
381   ncnt = 0;
382   for (i=0; i<nprocs; i++) {
383     if (ecnt[i] > 0) {
384       ltotal_procs += 1;
385       ncnt += ecnt[i];
386     }
387   }
388   *total_procs = ltotal_procs;
389   lproclist = (int*)malloc(ltotal_procs*sizeof(int));
390   *proclist = lproclist;
391   lproc_inv = (int*)malloc(nprocs*sizeof(int));
392   *proc_inv = lproc_inv;
393   licnt = (int*)malloc(ltotal_procs*sizeof(int));
394   *icnt = licnt;
395 
396 /**
397  * Set up conventional CSR arrays to hold portion of matrix owned by this
398  * processor
399  */
400   rval = (double*)malloc(ncnt*sizeof(double));
401   idbg = ncnt;
402   jval = (int*)malloc(ncnt*sizeof(int));
403   ival = (int*)malloc((imax-imin+2)*ltotal_procs*sizeof(int));
404   ivalt = (int*)malloc((imax-imin+2)*ltotal_procs*sizeof(int));
405 
406   for (i=0; i<ncnt; i++) {
407     rval[i] = 0.0;
408     jval[i] = 0;
409   }
410 
411   j = (imax-imin+2)*ltotal_procs;
412   for (i=0; i<j; i++) {
413     ival[i] = 0;
414     ivalt[i] = 0;
415   }
416 
417   j = 0;
418   for (i=0; i<nprocs; i++) {
419     lproc_inv[i] = -1;
420   }
421 /**
422  * voffset represents vertical partitions in the row block that divide it into
423  * blocks that couple to grid elements on other processors
424  */
425   lvoffset = (int*)malloc(ltotal_procs*sizeof(int));
426   *voffset = lvoffset;
427   lvoffset[0] = 0;
428   for (i=0; i<nprocs; i++) {
429     if (ecnt[i] > 0) {
430       lproclist[j] = i;
431       if (j > 0) {
432         lvoffset[j] = ecnt[lproclist[j-1]]+lvoffset[j-1];
433       }
434       lproc_inv[i] = j;
435       j++;
436     }
437   }
438   free(ecnt);
439 
440   isize = imax-imin+2;
441   for (i=0; i<ltotal_procs; i++) {
442     licnt[i] = 0;
443   }
444   ltsize = 0;
445   for (i=imin; i<=imax; i++) {
446 /**
447  *   compute local indices of grid point corresponding to row i
448  */
449     indx = i - imin;
450     ix = indx%ldi;
451     indx = (indx - ix)/ldi;
452     iy = indx%ldj;
453     iz = (indx - iy)/ldj;
454     ix = ix + lo[0];
455     iy = iy + lo[1];
456     iz = iz + lo[2];
457 /**
458  *   find locations of neighbors in 7-point stencil (if they are on the grid)
459  */
460     ncnt = 0;
461     ixn[ncnt] = ix;
462     iyn[ncnt] = iy;
463     izn[ncnt] = iz;
464     il = ix - lo[0];
465     jl = iy - lo[1];
466     kl = iz - lo[2];
467     idx = kl*ldi*ldj + jl*ldi + il;
468     nghbrs[ncnt] = idx;
469     procid[ncnt] = me;
470     if (ix+1 <= idim - 1) {
471       ncnt++;
472       ixn[ncnt] = ix + 1;
473       iyn[ncnt] = iy;
474       izn[ncnt] = iz;
475       if (ix+1 > hi[0]) {
476         jdx = kp*pdi*pdj + jp*pdi + ip + 1;
477         il = 0;
478         jl = iy - lo[1];
479         kl = iz - lo[2];
480         ldpi = xld[ip+1];
481       } else {
482         jdx = me;
483         il = ix - lo[0] + 1;
484         jl = iy - lo[1];
485         kl = iz - lo[2];
486         ldpi = ldi;
487       }
488       idx = kl*ldpi*ldj + jl*ldpi + il;
489       nghbrs[ncnt] = idx;
490       procid[ncnt] = jdx;
491     }
492     if (ix-1 >= 0) {
493       ncnt++;
494       ixn[ncnt] = ix - 1;
495       iyn[ncnt] = iy;
496       izn[ncnt] = iz;
497       if (ix-1 < lo[0]) {
498         jdx = kp*pdi*pdj + jp*pdi + ip - 1;
499         il = xld[ip-1] - 1;
500         jl = iy - lo[1];
501         kl = iz - lo[2];
502         ldmi = xld[ip-1];
503       } else {
504         jdx = me;
505         il = ix - lo[0] - 1;
506         jl = iy - lo[1];
507         kl = iz - lo[2];
508         ldmi = ldi;
509       }
510       idx = kl*ldmi*ldj + jl*ldmi + il;
511       nghbrs[ncnt] = idx;
512       procid[ncnt] = jdx;
513     }
514     if (iy+1 <= jdim-1) {
515       ncnt++;
516       ixn[ncnt] = ix;
517       iyn[ncnt] = iy + 1;
518       izn[ncnt] = iz;
519       if (iy+1 > hi[1]) {
520         jdx = kp*pdi*pdj + (jp+1)*pdi + ip;
521         il = ix - lo[0];
522         jl = 0;
523         kl = iz - lo[2];
524         ldpj = yld[jp+1];
525       } else {
526         jdx = me;
527         il = ix - lo[0];
528         jl = iy - lo[1] + 1;
529         kl = iz - lo[2];
530         ldpj = ldj;
531       }
532       idx = kl*ldi*ldpj + jl*ldi + il;
533       nghbrs[ncnt] = idx;
534       procid[ncnt] = jdx;
535     }
536     if (iy-1 >= 0) {
537       ncnt++;
538       ixn[ncnt] = ix;
539       iyn[ncnt] = iy - 1;
540       izn[ncnt] = iz;
541       if (iy-1 < lo[1]) {
542         jdx = kp*pdi*pdj + (jp-1)*pdi + ip;
543         il = ix - lo[0];
544         jl = yld[jp-1] - 1;
545         kl = iz - lo[2];
546         ldmj = yld[jp-1];
547       } else {
548         jdx = me;
549         il = ix - lo[0];
550         jl = iy - lo[1] - 1;
551         kl = iz - lo[2];
552         ldmj = ldj;
553       }
554       idx = kl*ldi*ldmj + jl*ldi + il;
555       nghbrs[ncnt] = idx;
556       procid[ncnt] = jdx;
557     }
558     if (iz+1 <= kdim-1) {
559       ncnt++;
560       ixn[ncnt] = ix;
561       iyn[ncnt] = iy;
562       izn[ncnt] = iz + 1;
563       if (iz+1 > hi[2]) {
564         jdx = (kp+1)*pdi*pdj + jp*pdi + ip;
565         il = ix - lo[0];
566         jl = iy - lo[1];
567         kl = 0;
568       } else {
569         jdx = me;
570         il = ix - lo[0];
571         jl = iy - lo[1];
572         kl = iz - lo[2] + 1;
573       }
574       idx = kl*ldi*ldj + jl*ldi + il;
575       nghbrs[ncnt] = idx;
576       procid[ncnt] = jdx;
577     }
578     if (iz-1 >= 0) {
579       ncnt++;
580       ixn[ncnt] = ix;
581       iyn[ncnt] = iy;
582       izn[ncnt] = iz - 1;
583       if (iz-1 < lo[2]) {
584         jdx = (kp-1)*pdi*pdj + jp*pdi + ip;
585         il = ix - lo[0];
586         jl = iy - lo[1];
587         kl = zld[kp-1] - 1;
588       } else {
589         jdx = me;
590         il = ix - lo[0];
591         jl = iy - lo[1];
592         kl = iz - lo[2] - 1;
593       }
594       idx = kl*ldi*ldj + jl*ldi + il;
595       nghbrs[ncnt] = idx;
596       procid[ncnt] = jdx;
597     }
598 /**
599  *  sort j indices. This uses a simple bubble sort, but ncnt should be small. A
600  *  more sophisticated approach could be taken if this is too time consuming.
601  */
602     ncnt++;
603     for (j=0; j<ncnt; j++) {
604       for (k=j+1; k<ncnt; k++) {
605         if (nghbrs[j] > nghbrs[k]) {
606           itmp = nghbrs[j];
607           nghbrs[j] = nghbrs[k];
608           nghbrs[k] = itmp;
609           itmp = ixn[j];
610           ixn[j] = ixn[k];
611           ixn[k] = itmp;
612           itmp = iyn[j];
613           iyn[j] = iyn[k];
614           iyn[k] = itmp;
615           itmp = izn[j];
616           izn[j] = izn[k];
617           izn[k] = itmp;
618           itmp = procid[j];
619           procid[j] = procid[k];
620           procid[k] = itmp;
621         }
622       }
623     }
624     for (k=0; k<ncnt; k++) {
625       if (nghbrs[k] < 0 || nghbrs[k] >= ntot) {
626         printf("p[%d] Invalid neighbor %d\n",me,nghbrs[k]);
627       }
628     }
629 /**
630  *  create array elements
631  *   for (j=0; j<ltotal_procs; j++) {
632  *     printf("p[%d] lvoffset[%d]: %d\n",me,j,lvoffset[j]);
633  *   }
634  */
635     for (j=0; j<ncnt; j++) {
636       jdx = procid[j];
637       idx = lproc_inv[jdx];
638       if (ix == ixn[j] && iy == iyn[j] && iz == izn[j]) {
639         rval[lvoffset[idx]+licnt[idx]] = 6.0;
640       } else {
641         rval[lvoffset[idx]+licnt[idx]] = -1.0;
642       }
643       if (lvoffset[idx]+licnt[idx]>=idbg) {
644       }
645       /* TODO: Check this carefully */
646       jval[lvoffset[idx]+licnt[idx]] = nghbrs[j];
647       ivalt[idx*isize+i-imin] = ivalt[idx*isize+i-imin]+1;
648       licnt[idx]++;
649     }
650   }
651   for (i=0; i<ltotal_procs; i++) {
652     ival[i*isize] = lvoffset[i];
653     for (j=1; j<isize; j++) {
654       ival[i*isize+j] = ival[i*isize+j-1] + ivalt[i*isize+j-1];
655     }
656   }
657   free(ivalt);
658   isize = 0;
659   for (i=0; i<ltotal_procs; i++) {
660     isize = isize + licnt[i];
661   }
662   ltsize = isize;
663   GA_Igop(&ltsize,one,"+");
664 /**
665  *   Local portion of sparse matrix has been evaluated and decomposed into blocks
666  *   that match partitioning of right hand side across processors. The following
667  *   data is available at this point:
668  *      1) total_procs: the number of processors that are coupled to this one via
669  *         the sparse matrix
670  *      2) proc_list(total_procs): a list of processor IDs that are coupled to
671  *         this processor
672  *      3) proc_inv(nprocs): The entry in proc_list that corresponds to a given
673  *         processor. If the entry is zero then that processor does not couple to
674  *         this processor.
675  *      4) icnt(total_procs): The number of non-zero entries in the sparse matrix
676  *         that couple the process represented by proc_list(j) to this process
677  *      5) voffset(total_procs): The offsets for the non-zero data in the arrays
678  *         rval and jval for the blocks that couple this processor to other
679  *         processes in proc_list
680  *      6) offset(nprocs): the offset array for the distributed right hand side
681  *         vector
682  *      7) nsize(nprocs): number of vector elements per processor for right hand
683  *         side vector (equivalent to the number of rows per processor)
684  *
685  *    These arrays describe how the sparse matrix is layed out both locally and
686  *    across processors. In addition, the actual data for the distributed sparse
687  *    matrix is found in the following arrays:
688  *      1) rval: values of matrix for all blocks on this processor
689  *      2) jval: j-indices of matrix for all blocks on this processor
690  *      3) ival(total_procs*(nsize(me)+1)): starting index in rval and jval for
691  *         each row in each block
692  *
693  *    create global arrays to hold sparse matrix
694  */
695   tmapc = (int*)malloc(nprocs*sizeof(int));
696   limapc = (int*)malloc(nprocs*sizeof(int));
697   for (i=0; i<nprocs; i++) {
698     tmapc[i] = 0;
699   }
700   for (i=0; i<ltotal_procs; i++) {
701     tmapc[me] = tmapc[me] + licnt[i];
702   }
703   GA_Igop(tmapc,nprocs,"+");
704   isize = 0;
705   for (i=0; i<nprocs; i++) {
706     isize += tmapc[i];
707   }
708   limapc[0] = 0;
709   for (i=1; i<nprocs; i++) {
710     limapc[i] = limapc[i-1] + tmapc[i-1];
711   }
712 
713   *g_data = NGA_Create_handle();
714   NGA_Set_data(*g_data,one,&isize,C_DBL);
715   NGA_Set_irreg_distr(*g_data,limapc,&nprocs);
716   status = NGA_Allocate(*g_data);
717 
718   *g_j = NGA_Create_handle();
719   NGA_Set_data(*g_j,one,&isize,C_INT);
720   NGA_Set_irreg_distr(*g_j,limapc,&nprocs);
721   status = NGA_Allocate(*g_j);
722 
723   for (i=0; i<nprocs; i++) {
724     tmapc[i] = 0;
725   }
726   tmapc[me] = tmapc[me] + (lnsize[me]+1)*ltotal_procs;
727   GA_Igop(tmapc,nprocs,"+");
728   ntot = 0;
729   for (i=0; i<nprocs; i++) {
730     ntot += tmapc[i];
731   }
732   limapc[0] = 0;
733   for (i=1; i<nprocs; i++) {
734     limapc[i] = limapc[i-1] + tmapc[i-1];
735   }
736 
737   *g_i = NGA_Create_handle();;
738   NGA_Set_data(*g_i,one,&ntot,C_INT);
739   NGA_Set_irreg_distr(*g_i,limapc,&nprocs);
740   status = NGA_Allocate(*g_i);
741 
742   free(tmapc);
743 
744   GA_Sync();
745 /**
746  *  fill global arrays with local data
747  */
748   NGA_Distribution(*g_data,me,&tlo,&thi);
749   ld = thi-tlo+1;
750   NGA_Put(*g_data, &tlo, &thi, rval, &ld);
751   NGA_Put(*g_j, &tlo, &thi, jval, &ld);
752 
753   NGA_Distribution(*g_i,me,&tlo,&thi);
754   ld = thi-tlo+1;
755   NGA_Put(*g_i, &tlo, &thi, ival, &ld);
756 
757   GA_Sync();
758 
759   free(rval);
760   free(jval);
761   free(ival);
762   free(xld);
763   free(yld);
764   free(zld);
765 
766   limapc[0] = 0;
767   for (i=1; i<nprocs; i++) {
768     limapc[i] = limapc[i-1] + lnsize[i-1];
769   }
770   *imapc = limapc;
771   *tsize = isize;
772   return;
773 }
774 
main(int argc,char ** argv)775 int main(int argc, char **argv) {
776   int nmax, nprocs, me;
777   int g_a_data, g_a_i, g_a_j, isize;
778   int g_b, g_c;
779   int i, j, jj, k, one, jcnt;
780   int chunk, kp1, ld;
781   int *p_i, *p_j;
782   double *p_data, *p_b, *p_c;
783   double t_beg, t_beg2, t_ga_tot, t_get, t_mult, t_cnstrct, t_mpi_in, t_ga_in;
784   double t_hypre_strct;
785   double prdot, dotga, dothypre, tempc;
786   double prtot, gatot, hypretot;
787   int status;
788   int idim, jdim, kdim, idum, memsize;
789   int jmin, jmax, lsize, ntot;
790   int heap=200000, fudge=100, stack=200000, ma_heap;
791   double *cbuf, *vector;
792   int pdi, pdj, pdk, ip, jp, kp, ncells;
793   int lo[3],hi[3];
794   int blo[3], bhi[3];
795   int ld_a, ld_b, ld_c, ld_i, ld_j, irows, ioff, joff, total_procs;
796   int iproc, iblock, btot;
797   double *amat, *bvec;
798   int *ivec, *jvec;
799   int *proclist, *proc_inv, *icnt;
800   int *voffset, *nsize, *offset, *mapc;
801   int iloop;
802   int LOOPNUM = 100;
803 /**
804  *  Hypre declarations. These are only used if linking to Hypre library for
805  *  performance testing
806  */
807   int ierr;
808 #if USE_HYPRE
809   HYPRE_StructGrid grid;
810   HYPRE_StructStencil stencil;
811   HYPRE_StructMatrix matrix;
812   HYPRE_StructVector vec_x, vec_y;
813   int i4, j4, ndim, nelems, offsets[7][3];
814   int stencil_indices[7], hlo[3], hhi[3];
815   double weights[7];
816   double *values;
817   double alpha, beta;
818   int *rows, *cols;
819 #endif
820 /**
821  *  Intitialize a message passing library
822  */
823   one = 1;
824   MP_INIT(argc,argv);
825 /**
826  *    Initialize GA
827  *
828  *    There are 2 choices: ga_initialize or ga_initialize_ltd.
829  *    In the first case, there is no explicit limit on memory usage.
830  *    In the second, user can set limit (per processor) in bytes.
831  */
832   t_beg = GA_Wtime();
833   GA_Initialize();
834   t_ga_in = GA_Wtime() - t_beg;
835   GA_Dgop(&t_ga_in,one,"+");
836 #if 0
837   memsize = 500000000
838   call ga_initialize_ltd(memsize)
839   memsize = 500000
840   status = ma_init(MT_DBL,memsize,memsize)
841 #endif
842 
843   t_ga_tot = 0.0;
844   t_mult = 0.0;
845   t_get = 0.0;
846   t_hypre_strct = 0.0;
847   prtot = 0.0;
848   gatot = 0.0;
849   hypretot = 0.0;
850 
851   me = GA_Nodeid();
852   nprocs = GA_Nnodes();
853   if (me == 0) {
854    printf("Time to initialize GA:                                 %12.4f\n",
855           t_ga_in/((double)nprocs));
856   }
857 /**
858  *  we can also use GA_set_memory_limit BEFORE first ga_create call
859  */
860   ma_heap = heap + fudge;
861 /* call GA_set_memory_limit(util_mdtob(ma_heap)) */
862 
863   if (me == 0) {
864     printf("\nNumber of cores used: %d\n\nGA initialized\n\n",nprocs);
865   }
866 /**
867  *  Initialize the MA package
868  *     MA must be initialized before any global array is allocated
869  */
870   if (!MA_init(MT_DBL, stack, ma_heap)) GA_Error("ma_init failed",-1);
871 /**
872  *   create a sparse LMAX x LMAX matrix and two vectors of length
873  *   LMAX. The matrix is stored in compressed row format.
874  *   One of the vectors is filled with random data and the other
875  *   is filled with zeros.
876  */
877   idim = IMAX;
878   jdim = JMAX;
879   kdim = KMAX;
880   ntot = idim*jdim*kdim;
881   if (me == 0) {
882     printf("\nDimension of matrix: %d\n\n",ntot);
883   }
884   t_beg = GA_Wtime();
885   grid_factor(nprocs,idim,jdim,kdim,&pdi,&pdj,&pdk);
886   if (me == 0) {
887     printf("\nProcessor grid configuration\n");
888     printf("  PDX: %d\n",pdi);
889     printf("  PDY: %d\n",pdj);
890     printf("  PDZ: %d\n\n",pdk);
891   }
892 
893   create_laplace_mat(idim,jdim,kdim,pdi,pdj,pdk,
894                      &g_a_data,&g_a_i,&g_a_j,&total_procs,&proclist,&proc_inv,
895                      &icnt,&voffset,&nsize,&offset,&isize,&mapc);
896   t_cnstrct = GA_Wtime() - t_beg;
897   if (me == 0) {
898     printf("\nNumber of non-zero elements in compressed matrix: %d\n",isize);
899   }
900 
901   g_b = GA_Create_handle();
902   GA_Set_data(g_b,one,&ntot,C_DBL);
903   GA_Set_irreg_distr(g_b,mapc,&nprocs);
904   status = GA_Allocate(g_b);
905 /**
906  *  fill g_b with random values
907  */
908   NGA_Distribution(g_b,me,blo,bhi);
909   NGA_Access(g_b,blo,bhi,&p_b,&ld);
910   ld = bhi[0]-blo[0]+1;
911   btot = ld;
912   vector = (double*)malloc(ld*sizeof(double));
913   for (i=0; i<ld; i++) {
914     idum  = 0;
915     p_b[i] = ran3(&idum);
916     vector[i] = p_b[i];
917   }
918   NGA_Release(g_b,blo,bhi);
919   GA_Sync();
920 
921   g_c = GA_Create_handle();
922   NGA_Set_data(g_c,one,&ntot,C_DBL);
923   NGA_Set_irreg_distr(g_c,mapc,&nprocs);
924   status = GA_Allocate(g_c);
925   NGA_Zero(g_c);
926 #if USE_HYPRE
927 /**
928  *  Assemble HYPRE grid and use that to create matrix. Start by creating
929  *  grid partition
930  */
931   ndim = 3;
932   i = me;
933   ip = i%pdi;
934   i = (i-ip)/pdi;
935   jp = i%pdj;
936   kp = (i-jp)/pdj;
937   lo[0] = (int)(((double)idim)*((double)ip)/((double)pdi));
938   if (ip < pdi-1) {
939     hi[0] = (int)(((double)idim)*((double)(ip+1))/((double)pdi)) - 1;
940   } else {
941     hi[0] = idim - 1;
942   }
943   lo[1] = (int)(((double)jdim)*((double)jp)/((double)pdj));
944   if (jp < pdj-1) {
945     hi[1] = (int)(((double)jdim)*((double)(jp+1))/((double)pdj)) - 1;
946   } else {
947     hi[1] = jdim - 1;
948   }
949   lo[2] = (int)(((double)kdim)*((double)kp)/((double)pdk));
950   if (kp < pdk-1) {
951     hi[2] = (int)(((double)kdim)*((double)(kp+1))/((double)pdk)) - 1;
952   } else {
953     hi[2] = kdim - 1;
954   }
955 /**
956  *  Create grid
957  */
958   hlo[0] = lo[0];
959   hlo[1] = lo[1];
960   hlo[2] = lo[2];
961   hhi[0] = hi[0];
962   hhi[1] = hi[1];
963   hhi[2] = hi[2];
964   ierr = HYPRE_StructGridCreate(MPI_COMM_WORLD, ndim, &grid);
965   ierr = HYPRE_StructGridSetExtents(grid, hlo, hhi);
966   ierr = HYPRE_StructGridAssemble(grid);
967 /**
968  *  Create stencil
969  */
970   offsets[0][0] = 0;
971   offsets[0][1] = 0;
972   offsets[0][2] = 0;
973 
974   offsets[1][0] = 1;
975   offsets[1][1] = 0;
976   offsets[1][2] = 0;
977 
978   offsets[2][0] = 0;
979   offsets[2][1] = 1;
980   offsets[2][2] = 0;
981 
982   offsets[3][0] = 0;
983   offsets[3][1] = 0;
984   offsets[3][2] = 1;
985 
986   offsets[4][0] = -1;
987   offsets[4][1] = 0;
988   offsets[4][2] = 0;
989 
990   offsets[5][0] = 0;
991   offsets[5][1] = -1;
992   offsets[5][2] = 0;
993 
994   offsets[6][0] = 0;
995   offsets[6][1] = 0;
996   offsets[6][2] = -1;
997 
998   nelems = 7;
999   ierr = HYPRE_StructStencilCreate(ndim, nelems, &stencil);
1000   for (i=0; i<nelems; i++) {
1001     ierr = HYPRE_StructStencilSetElement(stencil, i, offsets[i]);
1002   }
1003 
1004   ncells = (hi[0]-lo[0]+1)*(hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
1005   jcnt = 7*ncells;
1006   values = (double*)malloc(jcnt*sizeof(double));
1007   jcnt = 0;
1008   weights[0] = 6.0;
1009   weights[1] = -1.0;
1010   weights[2] = -1.0;
1011   weights[3] = -1.0;
1012   weights[4] = -1.0;
1013   weights[5] = -1.0;
1014   weights[6] = -1.0;
1015   for (i=0; i<ncells; i++) {
1016     for (j=0; j<7; j++) {
1017       values[jcnt] = weights[j];
1018       jcnt++;
1019     }
1020   }
1021 
1022   ierr = HYPRE_StructMatrixCreate(MPI_COMM_WORLD, grid, stencil, &matrix);
1023   ierr = HYPRE_StructMatrixInitialize(matrix);
1024   for (i=0; i<7; i++) {
1025     stencil_indices[i] = i;
1026   }
1027   ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, 7, stencil_indices, values);
1028   free(values);
1029 /**
1030  *  Check all six sides of current box to see if any are boundaries.
1031  *  Set values to zero if they are.
1032  */
1033   if (hi[0] == idim-1) {
1034     ncells = (hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
1035     hlo[0] = idim-1;
1036     hhi[0] = idim-1;
1037     hlo[1] = lo[1];
1038     hhi[1] = hi[1];
1039     hlo[2] = lo[2];
1040     hhi[2] = hi[2];
1041     values = (double*)malloc(ncells*sizeof(double));
1042     for (i=0; i<ncells; i++) values[i] = 0.0;
1043     i4 = 1;
1044     j4 = 1;
1045     ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
1046     free(values);
1047   }
1048   if (hi[1] == jdim-1) {
1049     ncells = (hi[0]-lo[0]+1)*(hi[2]-lo[2]+1);
1050     hlo[0] = lo[0];
1051     hhi[0] = hi[0];
1052     hlo[1] = jdim-1;
1053     hhi[1] = jdim-1;
1054     hlo[2] = lo[2];
1055     hhi[2] = hi[2];
1056     values = (double*)malloc(ncells*sizeof(double));
1057     for (i=0; i<ncells; i++) values[i] = 0.0;
1058     i4 = 1;
1059     j4 = 2;
1060     ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
1061     free(values);
1062   }
1063   if (hi[2] == kdim-1) {
1064     ncells = (hi[0]-lo[0]+1)*(hi[1]-lo[1]+1);
1065     hlo[0] = lo[0];
1066     hhi[0] = hi[0];
1067     hlo[1] = lo[1];
1068     hhi[1] = hi[1];
1069     hlo[2] = kdim-1;
1070     hhi[2] = kdim-1;
1071     values = (double*)malloc(ncells*sizeof(double));
1072     for (i=0; i<ncells; i++) values[i] = 0.0;
1073     i4 = 1;
1074     j4 = 3;
1075     ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
1076     free(values);
1077   }
1078   if (lo[0] == 0) {
1079     ncells = (hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
1080     hlo[0] = 0;
1081     hhi[0] = 0;
1082     hlo[1] = lo[1];
1083     hhi[1] = hi[1];
1084     hlo[2] = lo[2];
1085     hhi[2] = hi[2];
1086     values = (double*)malloc(ncells*sizeof(double));
1087     for (i=0; i<ncells; i++) values[i] = 0.0;
1088     i4 = 1;
1089     j4 = 4;
1090     ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
1091     free(values);
1092   }
1093   if (lo[1] == 0) {
1094     ncells = (hi[0]-lo[0]+1)*(hi[2]-lo[2]+1);
1095     hlo[0] = lo[0];
1096     hhi[0] = hi[0];
1097     hlo[1] = 0;
1098     hhi[1] = 0;
1099     hlo[2] = lo[2];
1100     hhi[2] = hi[2];
1101     values = (double*)malloc(ncells*sizeof(double));
1102     for (i=0; i<ncells; i++) values[i] = 0.0;
1103     i4 = 1;
1104     j4 = 5;
1105     ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
1106     free(values);
1107   }
1108   if (lo[2] == 1) {
1109     ncells = (hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
1110     hlo[0] = lo[0];
1111     hhi[0] = hi[0];
1112     hlo[1] = lo[1];
1113     hhi[1] = hi[1];
1114     hlo[2] = 0;
1115     hhi[2] = 0;
1116     values = (double*)malloc(ncells*sizeof(double));
1117     for (i=0; i<ncells; i++) values[i] = 0.0;
1118     i4 = 1;
1119     j4 = 6;
1120     ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
1121     free(values);
1122   }
1123   ierr = HYPRE_StructMatrixAssemble(matrix);
1124 /**
1125  *   Create vectors for matrix-vector multiply
1126  */
1127   ierr = HYPRE_StructVectorCreate(MPI_COMM_WORLD, grid, &vec_x);
1128   ierr = HYPRE_StructVectorInitialize(vec_x);
1129   hlo[0] = lo[0];
1130   hlo[1] = lo[1];
1131   hlo[2] = lo[2];
1132   hhi[0] = hi[0];
1133   hhi[1] = hi[1];
1134   hhi[2] = hi[2];
1135   ierr = HYPRE_StructVectorSetBoxValues(vec_x, hlo, hhi, vector);
1136   ierr = HYPRE_StructVectorAssemble(vec_x);
1137   NGA_Distribution(g_a_i,me,blo,bhi);
1138 
1139   if (bhi[1] > ntot-1) {
1140     bhi[1] = ntot-1;
1141   }
1142 
1143   btot = (hi[0]-lo[0]+1)*(hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
1144 
1145   for (i=0; i<btot; i++) vector[i] = 0.0;
1146   hlo[0] = lo[0];
1147   hlo[1] = lo[1];
1148   hlo[2] = lo[2];
1149   hhi[0] = hi[0];
1150   hhi[1] = hi[1];
1151   hhi[2] = hi[2];
1152   ierr = HYPRE_StructVectorGetBoxValues(vec_x, hlo, hhi, vector);
1153 
1154   for (i=0; i<btot; i++) vector[i] = 0.0;
1155   ierr = HYPRE_StructVectorCreate(MPI_COMM_WORLD, grid, &vec_y);
1156   ierr = HYPRE_StructVectorInitialize(vec_y);
1157   ierr = HYPRE_StructVectorSetBoxValues(vec_y, hlo, hhi, vector);
1158   ierr = HYPRE_StructVectorAssemble(vec_y);
1159 #endif
1160 /**
1161  *  Multiply sparse matrix. Start by accessing pointers to local portions of
1162  *  g_a_data, g_a_j, g_a_i
1163  */
1164   for (iloop=0; iloop<LOOPNUM; iloop++) {
1165     t_beg2 = GA_Wtime();
1166     NGA_Distribution(g_a_data,me,blo,bhi);
1167     NGA_Access(g_a_data,blo,bhi,&p_data,&ld_a);
1168 
1169     NGA_Distribution(g_a_j,me,blo,bhi);
1170     NGA_Access(g_a_j,blo,bhi,&p_j,&ld_j);
1171 
1172     NGA_Distribution(g_a_i,me,blo,bhi);
1173     NGA_Access(g_a_i,blo,bhi,&p_i,&ld_i);
1174 
1175     NGA_Distribution(g_c,me,blo,bhi);
1176     NGA_Access(g_c,blo,bhi,&p_c,&ld_c);
1177     for (i = 0; i<bhi[0]-blo[0]+1; i++) {
1178       p_c[i] = 0.0;
1179     }
1180     /**
1181      * Loop through matrix blocks
1182      */
1183     ioff = 0;
1184     for (iblock = 0; iblock<total_procs; iblock++) {
1185       iproc = proclist[iblock];
1186       NGA_Distribution(g_b,iproc,blo,bhi);
1187       bvec = (double*)malloc((bhi[0]-blo[0]+1)*sizeof(double));
1188       j = 0;
1189       t_beg = GA_Wtime();
1190       NGA_Get(g_b,blo,bhi,bvec,&j);
1191       t_get = t_get + GA_Wtime() - t_beg;
1192       t_beg = GA_Wtime();
1193       irows = nsize[me]-1;
1194       for (i=0; i<=irows; i++) {
1195         jmin = p_i[ioff+i];
1196         jmax = p_i[ioff+i+1]-1;
1197         tempc = 0.0;
1198         for (j = jmin; j<=jmax; j++) {
1199           jj = p_j[j];
1200           tempc = tempc + p_data[j]*bvec[jj];
1201         }
1202         p_c[i] = p_c[i] + tempc;
1203       }
1204       ioff = ioff + nsize[me] + 1;
1205       t_mult = t_mult + GA_Wtime() - t_beg;
1206       free(bvec);
1207     }
1208     t_ga_tot = t_ga_tot + GA_Wtime() - t_beg2;
1209 
1210     NGA_Distribution(g_a_data,me,blo,bhi);
1211     NGA_Release(g_a_data,blo,bhi);
1212     NGA_Distribution(g_a_j,me,blo,bhi);
1213     NGA_Release(g_a_j,blo,bhi);
1214     NGA_Distribution(g_a_i,me,blo,bhi);
1215     NGA_Release(g_a_i,blo,bhi);
1216     NGA_Distribution(g_c,me,blo,bhi);
1217     NGA_Release(g_c,blo,bhi);
1218 
1219 #if USE_HYPRE
1220     alpha = 1.0;
1221     beta = 0.0;
1222     t_beg = GA_Wtime();
1223     ierr = HYPRE_StructMatrixMatvec(alpha, matrix, vec_x, beta, vec_y);
1224     t_hypre_strct = GA_Wtime() - t_beg;
1225     hlo[0] = lo[0];
1226     hlo[1] = lo[1];
1227     hlo[2] = lo[2];
1228     hhi[0] = hi[0];
1229     hhi[1] = hi[1];
1230     hhi[2] = hi[2];
1231     ierr = HYPRE_StructVectorGetBoxValues(vec_y, hlo, hhi, vector);
1232     NGA_Distribution(g_c,me,lo,hi);
1233     cbuf = (double*)malloc((hi[0]-lo[0]+1)*sizeof(double));
1234     NGA_Get(g_c,lo,hi,cbuf,&one);
1235     prdot = 0.0;
1236     dotga = 0.0;
1237     dothypre = 0.0;
1238     for (i=0; i<(hi[0]-lo[0]+1); i++) {
1239       dothypre = dothypre + vector[i]*vector[i];
1240       dotga = dotga + cbuf[i]*cbuf[i];
1241       prdot = prdot + (vector[i]-cbuf[i])*(vector[i]-cbuf[i]);
1242     }
1243     NGA_Dgop(&dotga,1,"+");
1244     NGA_Dgop(&dothypre,1,"+");
1245     NGA_Dgop(&prdot,1,"+");
1246     gatot += sqrt(dotga);
1247     hypretot += sqrt(dothypre);
1248     prtot += sqrt(prdot);
1249     free(cbuf);
1250 #endif
1251   }
1252 #if USE_HYPRE
1253   if (me == 0) {
1254     printf("Magnitude of GA solution:                         %e\n",
1255         gatot/((double)LOOPNUM));
1256     printf("Magnitude of HYPRE solution:                      %e\n",
1257         hyprtot/((double)LOOPNUM));
1258     printf("Difference between GA and HYPRE (Struct) results: %e\n",
1259         prtot/((double)LOOPNUM));
1260   }
1261 #endif
1262 
1263   free(vector);
1264 /**
1265  *  Clean up arrays
1266  */
1267   NGA_Destroy(g_b);
1268   NGA_Destroy(g_c);
1269   NGA_Destroy(g_a_data);
1270   NGA_Destroy(g_a_i);
1271   NGA_Destroy(g_a_j);
1272 #if USE_HYPRE
1273 #if USE_STRUCT
1274   ierr = HYPRE_StructStencilDestroy(stencil);
1275   ierr = HYPRE_StructGridDestroy(grid);
1276   ierr = HYPRE_StructMatrixDestroy(matrix);
1277   ierr = HYPRE_StructVectorDestroy(vec_x);
1278   ierr = HYPRE_StructVectorDestroy(vec_y);
1279 #endif
1280 #endif
1281 
1282   NGA_Dgop(&t_cnstrct,1,"+");
1283   NGA_Dgop(&t_get,1,"+");
1284   NGA_Dgop(&t_mult,1,"+");
1285   NGA_Dgop(&t_ga_tot,1,"+");
1286 #if USE_HYPRE
1287   NGA_Dgop(&t_hypre_strct,1,"+");
1288 #endif
1289   free(proclist);
1290   free(proc_inv);
1291   free(icnt);
1292   free(voffset);
1293   free(nsize);
1294   free(offset);
1295   free(mapc);
1296 
1297   if (me == 0) {
1298     printf("Time to create sparse matrix:                         %12.4f\n",
1299       t_cnstrct/((double)nprocs));
1300     printf("Time to get right hand side vector:                   %12.4f\n",
1301       t_get/((double)nprocs));
1302     printf("Time for sparse matrix block multiplication:          %12.4f\n",
1303       t_mult/((double)nprocs));
1304     printf("Time for total sparse matrix multiplication:          %12.4f\n",
1305       t_ga_tot/((double)nprocs));
1306 #if USE_HYPRE
1307 #if USE_STRUCT
1308     printf("Total time for HYPRE (Struct)  matrix-vector multiply:%12.4f\n",
1309       t_hypre_strct/((double)nprocs));
1310 #endif
1311 #endif
1312   }
1313   NGA_Terminate();
1314 /**
1315  *  Tidy up after message-passing library
1316  */
1317   MP_FINALIZE();
1318 }
1319