1 
2 #include <stdio.h>
3 
4 #include "macdecls.h"
5 #include "ga.h"
6 #include "gp.h"
7 #define USE_HYPRE 0
8 #define IMAX 100
9 #define JMAX 100
10 #define KMAX 100
11 #define LMAX IMAX*JMAX*KMAX
12 #define MAXVEC 5000000
13 #define EPSLN 1.0d-10
14 #define CHECK_BOUND 1
15 
16 #define bb_a(ib) bb_v(bb_i + (ib))
17 #define cc_a(ib) cc_v(cc_i + (ib))
18 
19 #define Integer int
20 #define MAX_FACTOR 10000
21 
22 #if USE_HYPRE
23 #include "HYPRE.h"
24 #include "HYPRE_struct_mv.h"
25 #include "mpi.h"
26 #endif
27 
grid_factor(int p,int xdim,int ydim,int zdim,int * idx,int * idy,int * idz)28 void grid_factor(int p, int xdim, int ydim, int zdim, int *idx, int *idy, int *idz) {
29   int i, j;
30   int ip, ifac, pmax, prime[MAX_FACTOR];
31   int fac[MAX_FACTOR];
32   int ix, iy, iz, ichk;
33 
34   i = 1;
35 /*
36      factor p completely
37      first, find all prime numbers, besides 1, less than or equal to
38      the square root of p
39 */
40   ip = (int)(sqrt((double)p))+1;
41   pmax = 0;
42   for (i=2; i<=ip; i++) {
43     ichk = 1;
44     for (j=0; j<pmax; j++) {
45       if (i%prime[j] == 0) {
46         ichk = 0;
47         break;
48       }
49     }
50     if (ichk) {
51       pmax = pmax + 1;
52       if (pmax > MAX_FACTOR) printf("Overflow in grid_factor\n");
53       prime[pmax-1] = i;
54     }
55   }
56 /*
57      find all prime factors of p
58 */
59   ip = p;
60   ifac = 0;
61   for (i=0; i<pmax; i++) {
62     while(ip%prime[i] == 0) {
63       ifac = ifac + 1;
64       fac[ifac-1] = prime[i];
65       ip = ip/prime[i];
66     }
67   }
68 /*
69   p is prime
70  */
71   if (ifac==0) {
72     ifac++;
73     fac[0] = p;
74   }
75 
76 /*
77      determine three factors of p of approximately the
78      same size
79 */
80   *idx = 1;
81   *idy = 1;
82   *idz = 1;
83   for (i = ifac-1; i >= 0; i--) {
84     ix = xdim/(*idx);
85     iy = ydim/(*idy);
86     iz = zdim/(*idz);
87     if (ix >= iy && ix >= iz && ix > 1) {
88       *idx = fac[i]*(*idx);
89     } else if (iy >= ix && iy >= iz && iy > 1) {
90       *idy = fac[i]*(*idy);
91     } else if (iz >= ix && iz >= iy && iz > 1) {
92       *idz = fac[i]*(*idz);
93     } else {
94       printf("Too many processors in grid factoring routine\n");
95     }
96   }
97 }
98 
99 /*
100    Short subroutine for multiplying sparse matrix block with vector segment
101 */
loc_matmul(double * a_mat,int * jvec,int * ivec,double * bvec,double * cvec,int nrows)102 void loc_matmul(double *a_mat, int *jvec, int *ivec,
103                 double *bvec, double *cvec, int nrows) {
104   double tempc;
105   int i, j, jj, jmin,jmax;
106   for (i=0; i<nrows; i++) {
107     jmin = ivec[i];
108     jmax = ivec[i+1]-1;
109     tempc = 0.0;
110     for (j=jmin; j<jmax; j++) {
111       jj = jvec[j];
112       tempc = tempc + a_mat[j]*bvec[jj];
113     }
114     cvec[i] = cvec[i] + tempc;
115   }
116 }
117 /*
118     Random number generator
119 */
ran3(int * idum)120 double ran3(int *idum) {
121   static int iff = 0;
122   if (*idum < 0 || iff == 0) {
123     iff = 1;
124     srand(abs(*idum));
125     *idum = 1;
126   }
127   return ((double)rand())/((double)RAND_MAX);
128 }
129 
130 /*
131  * A serial routine for transposing a matrix stored in a standard Compressed
132  * Sparse Row (CSR) format. The routine is based on compiling a linked list of
133  * the elements in each row and then using that to perform the transpose. The
134  * ordering of the matrix elements in the transposed matrix is not monotonic,
135  * that would require an additional sort on each row of the transposed matrix.
136  */
stran(int nrows,int ncols,int * i_in,int * j_in,int * val_in,int * i_out,int * j_out,int * val_out)137 void stran(int nrows, int ncols, int *i_in, int* j_in, int *val_in,
138            int *i_out, int *j_out, int *val_out)
139 {
140   int i, j, jj, jcnt;
141   int nnz, jmin, jmax;
142   int *header, *link, *jdx;
143 
144   /* Create link list for all column entries */
145   nnz = i_in[nrows];
146   header = (int*)malloc(ncols*sizeof(int));
147   link = (int*)malloc(nnz*sizeof(int));
148   jdx = (int*)malloc(nnz*sizeof(int));
149   for (i=0; i<ncols; i++) header[i] = -1;
150   for (i=0; i<nnz; i++) link[i] = -1;
151   jcnt = 0;
152   for (i=0; i<nrows; i++) {
153     jmin = i_in[i];
154     jmax = i_in[i+1];
155     for (j=jmin; j<jmax; j++) {
156       jj = j_in[j];
157       link[jcnt] = header[jj];
158       jdx[jcnt] = i;
159       header[jj] = jcnt;
160       jcnt++;
161     }
162   }
163 
164   /* Use link-list to evaluate transpose */
165   jj = 0;
166   for (i=0; i<ncols; i++) {
167     i_out[i] = jj;
168     j = header[i];
169     while (j != -1) {
170       j_out[jj] = jdx[j];
171       val_out[jj] = val_in[j];
172       j = link[j];
173       jj++;
174     }
175   }
176   i_out[ncols] = nnz;
177   free(jdx);
178   free(link);
179   free(header);
180 }
181 
182 /* serial transpose code for real matrices */
stranr(int nrows,int ncols,int * i_in,int * j_in,double * val_in,int * i_out,int * j_out,double * val_out)183 void stranr(int nrows, int ncols, int *i_in, int* j_in, double *val_in,
184            int *i_out, int *j_out, double *val_out)
185 {
186   int i, j, jj, jcnt;
187   int nnz, jmin, jmax;
188   int *header, *link, *jdx;
189 
190   /* Create link list for all column entries */
191   nnz = i_in[nrows];
192   header = (int*)malloc(ncols*sizeof(int));
193   link = (int*)malloc(nnz*sizeof(int));
194   jdx = (int*)malloc(nnz*sizeof(int));
195   for (i=0; i<ncols; i++) header[i] = -1;
196   for (i=0; i<nnz; i++) link[i] = -1;
197   jcnt = 0;
198   for (i=0; i<nrows; i++) {
199     jmin = i_in[i];
200     jmax = i_in[i+1];
201     for (j=jmin; j<jmax; j++) {
202       jj = j_in[j];
203       link[jcnt] = header[jj];
204       jdx[jcnt] = i;
205       header[jj] = jcnt;
206       jcnt++;
207     }
208   }
209 
210   /* Use link-list to evaluate transpose */
211   jj = 0;
212   for (i=0; i<ncols; i++) {
213     i_out[i] = jj;
214     j = header[i];
215     while (j != -1) {
216       j_out[jj] = jdx[j];
217       val_out[jj] = val_in[j];
218       j = link[j];
219       jj++;
220     }
221   }
222   i_out[ncols] = nnz;
223   free(jdx);
224   free(link);
225   free(header);
226 }
227 
228 /*
229     create a random sparse matrix in compressed row form corresponding to a
230     7-point stencil for a grid on a lattice of dimension idim X jdim X kdim grid
231     points
232 */
create_laplace_mat(int idim,int jdim,int kdim,int pdi,int pdj,int pdk,int * gp_block,int * g_j,int * g_i,int ** imapc)233 void create_laplace_mat(int idim, int jdim, int kdim, int pdi, int pdj, int pdk,
234                         int *gp_block, int *g_j, int *g_i, int **imapc) {
235 /*
236     idim: i-dimension of grid
237     jdim: j-dimension of grid
238     kdim: k-dimension of grid
239     pdi: i-dimension of processor grid
240     pdj: j-dimension of processor grid
241     pdk: k-dimension of processor grid
242 !    g_data: global array of values
243 !    g_j: global array containing j indices (using local indices)
244 !    g_i: global array containing starting location of each row in g_j
245 !         (using local indices)
246     gp_block: global pointer array containing non-zero sparse sub-blocks of
247               matrix
248     g_j: global array containing j indices of sub-blocks
249     g_i: global array containing starting location of each row in g_j
250     tsize: total number of non-zero elements in matrix
251     imapc: map array for vectors
252 */
253   int ltotal_procs;
254   int *lproclist, *lproc_inv,  *lvoffset, *lnsize, *loffset, *licnt, *limapc;
255   int *nnz_list;
256   int nnz, offset, b_nnz;
257   int nprocs, me, imin, imax, jcnt;
258   int *jmin, *jmax;
259   int ix, iy, iz, idx;
260   double x, dr;
261   double *rval, *gp_rval;
262   int isize, idbg;
263   int *jval, *gp_jval, *ival, *gp_ival, *ivalt;
264   int i, j, k, itmp, one, tlo, thi, ld;
265   int idum, ntot, indx, nghbrs[7], ncnt, nsave;
266   int ixn[7],iyn[7],izn[7], procid[7];
267   int status;
268   int lo[3], hi[3], ip, jp, kp, ldi, ldj, jdx, joff;
269   int il, jl, kl, ldmi, ldpi, ldmj, ldpj;
270   int *xld, *yld, *zld, *tmapc;
271   int *ecnt, *total_distr;
272   int total_max, toffset;
273   int *iparams, *blk_ptr;
274   int *iparamst, *jvalt;
275   double *rvalt;
276   FILE *fp, *fopen();
277 
278   me = NGA_Nodeid();
279   nprocs = NGA_Nnodes();
280   idum = -(12345+me);
281   x = ran3(&idum);
282   one = 1;
283 
284   if (me == 0) {
285     printf("\n Dimension of grid: \n\n");
286     printf(" I Dimension: %d\n",idim);
287     printf(" J Dimension: %d\n",jdim);
288     printf(" K Dimension: %d\n\n",kdim);
289   }
290 /*
291    Find position of processor in processor grid and calulate minimum
292    and maximum values of indices
293 */
294   i = me;
295   ip = i%pdi;
296   i = (i-ip)/pdi;
297   jp = i%pdj;
298   kp = (i-jp)/pdj;
299 
300   lo[0] = (int)((((double)idim)*((double)ip))/((double)pdi));
301   if (ip < pdi-1) {
302     hi[0] = (int)((((double)idim)*((double)(ip+1)))/((double)pdi))-1;
303   } else {
304     hi[0] = idim - 1;
305   }
306 
307   lo[1] = (int)((((double)jdim)*((double)jp))/((double)pdj));
308   if (jp < pdj-1) {
309     hi[1] = (int)((((double)jdim)*((double)(jp+1)))/((double)pdj))-1;
310   } else {
311     hi[1] = jdim - 1;
312   }
313 
314   lo[2] = (int)((((double)kdim)*((double)kp))/((double)pdk));
315   if (kp < pdk-1) {
316     hi[2] = (int)((((double)kdim)*((double)(kp+1)))/((double)pdk))-1;
317   } else {
318     hi[2] = kdim - 1;
319   }
320 
321   ldi = hi[0]-lo[0]+1;
322   ldj = hi[1]-lo[1]+1;
323 
324   /* Evaluate xld, yld, zld. These contain the number of elements in each
325      division along the x, y, z axes */
326   xld = (int*)malloc(pdi*sizeof(int));
327   for (i=0; i<pdi; i++) {
328     if (i<pdi-1) {
329       xld[i] = (int)((((double)idim)*((double)(i+1)))/((double)pdi));
330     } else {
331       xld[i] = idim;
332     }
333     xld[i] = xld[i] - (int)((((double)idim)*((double)(i)))/((double)pdi));
334   }
335 
336   yld = (int*)malloc(pdj*sizeof(int));
337   for (i=0; i<pdj; i++) {
338     if (i<pdj-1) {
339       yld[i] = (int)((((double)jdim)*((double)(i+1)))/((double)pdj));
340     } else {
341       yld[i] = jdim;
342     }
343     yld[i] = yld[i] - (int)((((double)jdim)*((double)(i)))/((double)pdj));
344   }
345 
346   zld = (int*)malloc(pdk*sizeof(int));
347   for (i=0; i<pdk; i++) {
348     if (i<pdk-1) {
349       zld[i] = (int)((((double)kdim)*((double)(i+1)))/((double)pdk));
350     } else {
351       zld[i] = jdim;
352     }
353     zld[i] = zld[i] - (int)((((double)kdim)*((double)(i)))/((double)pdk));
354   }
355 
356 /* Determine number of rows per processor
357    lnsize[i]: number of rows associated with process i
358    loffset[i]: global offset to location of first row associated
359                with process i */
360 
361   lnsize = (int*)malloc(nprocs*sizeof(int));
362   loffset = (int*)malloc(nprocs*sizeof(int));
363   for (i=0; i<nprocs; i++) {
364     lnsize[i] = 0;
365     loffset[i] = 0;
366   }
367   lnsize[me] = (hi[0]-lo[0]+1)*(hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
368   NGA_Igop(lnsize,nprocs,"+");
369   loffset[0] = 0;
370   for (i=1; i<nprocs; i++) {
371     loffset[i] = loffset[i-1] + lnsize[i-1];
372   }
373 
374   ntot = idim*jdim*kdim;
375   NGA_Sync();
376 /*
377     scan over rows of lattice
378     imin: minimum global index of rows associated with this process (me)
379     imax: maximum global index of rows associated with this process (me)
380 */
381   imin = loffset[me];
382   imax = loffset[me]+lnsize[me]-1;
383   free(loffset);
384 /*
385     find out how many other processors couple to this row of blocks
386     ecnt[i]: the number of columns on processor i that are coupled to this
387     process
388 */
389   ecnt = (int*)malloc(nprocs*sizeof(int));
390   for (i=0; i<nprocs; i++) {
391     ecnt[i] = 0;
392   }
393 
394   for (i=imin; i<=imax; i++) {
395 /*
396     compute local indices of grid point corresponding to row i
397 */
398     indx = i - imin;
399     ix = indx%ldi;
400     indx = (indx - ix)/ldi;
401     iy = indx%ldj;
402     iz = (indx - iy)/ldj;
403     ix = ix + lo[0];
404     iy = iy + lo[1];
405     iz = iz + lo[2];
406 
407     ecnt[me] = ecnt[me] + 1;
408     if (ix+1 <= idim-1) {
409       if (ix+1 > hi[0]) {
410         jdx = kp*pdi*pdj + jp*pdi + ip + 1;
411         ecnt[jdx] = ecnt[jdx] + 1;
412       } else {
413         ecnt[me] = ecnt[me] + 1;
414       }
415     }
416     if (ix-1 >= 0) {
417       if (ix-1 < lo[0]) {
418         jdx = kp*pdi*pdj + jp*pdi + ip - 1;
419         ecnt[jdx] = ecnt[jdx] + 1;
420       } else {
421         ecnt[me] = ecnt[me] + 1;
422       }
423     }
424     if (iy+1 <= jdim-1) {
425       if (iy+1 > hi[1]) {
426         jdx = kp*pdi*pdj + (jp+1)*pdi + ip;
427         ecnt[jdx] = ecnt[jdx] + 1;
428       } else {
429         ecnt[me] = ecnt[me] + 1;
430       }
431     }
432     if (iy-1 >= 0) {
433       if (iy-1 < lo[1]) {
434         jdx = kp*pdi*pdj + (jp-1)*pdi + ip;
435         ecnt[jdx] = ecnt[jdx] + 1;
436       } else {
437         ecnt[me] = ecnt[me] + 1;
438       }
439     }
440     if (iz+1 <= kdim-1) {
441       if (iz+1 > hi[2]) {
442         jdx = (kp+1)*pdi*pdj + jp*pdi + ip;
443         ecnt[jdx] = ecnt[jdx] + 1;
444       } else {
445         ecnt[me] = ecnt[me] + 1;
446       }
447     }
448     if (iz-1 >= 0) {
449       if (iz-1 < lo[2]) {
450         jdx = (kp-1)*pdi*pdj + jp*pdi + ip;
451         ecnt[jdx] = ecnt[jdx] + 1;
452       } else {
453         ecnt[me] = ecnt[me] + 1;
454       }
455     }
456   }
457 
458 /* Create list of processors that this processor is coupled to.
459    If ecnt[i] is greater than zero then process i is coupled to this process.
460    ltotal_procs: the total number of other processor that this process is coupled
461                  to. This includes this process (the diagonal term).
462    lproclist[i]: the IDs of the processor that this processor is coupled to
463    lproc_inv[i]: the location in lproclist of processor i. If processor i is not
464                  coupled to this process, the lproc_inv[i] = -1
465    ncnt: total number of non-zero elements held by this process
466    nnz_list[i]: number of processes coupled to process i by sparse blocks
467    nnz: total number of sparse blocks */
468 
469   ltotal_procs = 0;
470   ncnt = 0;
471   for (i=0; i<nprocs; i++) {
472     if (ecnt[i] > 0) {
473       ltotal_procs++;
474       ncnt += ecnt[i];
475     }
476   }
477   nsave = ncnt;
478 
479   lproclist = (int*)malloc(ltotal_procs*sizeof(int));
480   lproc_inv = (int*)malloc(nprocs*sizeof(int));
481   licnt = (int*)malloc(ltotal_procs*sizeof(int));
482   for (i=0; i<ltotal_procs; i++) {
483     licnt[i] = 0;
484   }
485 
486   rval = (double*)malloc(ncnt*sizeof(double));
487   idbg = ncnt;
488   jval = (int*)malloc(ncnt*sizeof(int));
489   ival = (int*)malloc((imax-imin+2)*ltotal_procs*sizeof(int));
490   ivalt = (int*)malloc((imax-imin+2)*ltotal_procs*sizeof(int));
491 
492   for (i=0; i<ncnt; i++) {
493     rval[i] = 0.0;
494     jval[i] = 0;
495   }
496 
497   j = (imax-imin+2)*ltotal_procs;
498   for (i=0; i<j; i++) {
499     ival[i] = 0;
500     ivalt[i] = 0;
501   }
502 
503   nnz_list = (int*)malloc(nprocs*sizeof(int));
504   for (i=0; i<nprocs; i++) {
505     nnz_list[i] = 0;
506   }
507 
508   /* nnz is total number of non-zero sparse blocks */
509   nnz_list[me] = ltotal_procs;
510   NGA_Igop(nnz_list, nprocs, "+");
511   nnz = 0;
512   for (i=0; i<nprocs; i++) {
513     nnz += nnz_list[i];
514   }
515 
516 /*  lvoffset[i]: local offset into array ival[i] to get to elements associated
517     with block i (i runs from 0 to ltotal_procs-1)
518     isize: number of rows (plus 1) that reside on this processor */
519   isize = (imax-imin+2);
520   for (i=0; i<nprocs; i++) {
521     lproc_inv[i] = -1;
522   }
523   lvoffset = (int*)malloc(ltotal_procs*sizeof(int));
524   lvoffset[0] = 0;
525   j = 0;
526   for (i=0; i<nprocs; i++) {
527     if (ecnt[i] > 0) {
528       lproclist[j] = i;
529       if (j > 0) {
530         lvoffset[j] = ecnt[lproclist[j-1]]+lvoffset[j-1];
531       }
532       lproc_inv[i] = j;
533       j++;
534     }
535   }
536 
537 /* Create arrays the hold the sparse block representation of the sparse matrix
538    gp_block[nnz]: Global Pointer array holding the sparse sub-matrices
539    g_j[nnz]: column block indices for the element in gp_block
540    g_i[nprocs]: row block indices for the elements in g_j */
541 
542   tmapc = (int*)malloc((nprocs+1)*sizeof(int));
543   tmapc[0] = 0;
544   for (i=1; i<=nprocs; i++) {
545     tmapc[i] = tmapc[i-1]+nnz_list[i-1];
546   }
547   *gp_block = GP_Create_handle();
548   GP_Set_dimensions(*gp_block,one,&nnz);
549   GP_Set_irreg_distr(*gp_block, tmapc, &nprocs);
550   GP_Allocate(*gp_block);
551 
552   *g_j = NGA_Create_handle();
553   NGA_Set_data(*g_j,one,&nnz,C_INT);
554   NGA_Set_irreg_distr(*g_j, tmapc, &nprocs);
555   NGA_Allocate(*g_j);
556 
557   for (i=0; i<nprocs; i++) {
558     tmapc[i] = i;
559   }
560   *g_i = NGA_Create_handle();
561   i = nprocs+1;
562   NGA_Set_data(*g_i,one,&i,C_INT);
563   NGA_Set_irreg_distr(*g_i, tmapc, &nprocs);
564   NGA_Allocate(*g_i);
565   free(tmapc);
566 
567   jmin = (int*)malloc(nprocs*sizeof(int));
568   jmax = (int*)malloc(nprocs*sizeof(int));
569   for (i=0; i<nprocs; i++) {
570     jmin[i] = 0;
571     jmax[i] = 0;
572   }
573   jmin[me] = imin;
574   jmax[me] = imax;
575   NGA_Igop(jmin, nprocs, "+");
576   NGA_Igop(jmax, nprocs, "+");
577 
578 /*
579    Create the sparse blocks holding actual data. All the elements within each block
580    couple this processor to one other processor
581    rval[i]: values of matrix elements
582    jval[i]: column indices of matrix elements
583    ival[i]: index of first elements in rval and jval for the row represented by
584             the index i.
585    ivalt[i]: temporary array used in the construction of ival[i]
586 */
587   for (i=imin; i<=imax; i++) {
588     /*
589     compute local indices of grid point corresponding to row i
590      */
591     indx = i - imin;
592     ix = indx%ldi;
593     indx = (indx - ix)/ldi;
594     iy = indx%ldj;
595     iz = (indx - iy)/ldj;
596     ix = ix + lo[0];
597     iy = iy + lo[1];
598     iz = iz + lo[2];
599     /*
600     find locations of neighbors in 7-point stencil (if they are on the grid)
601      */
602     ncnt = 0;
603     ixn[ncnt] = ix;
604     iyn[ncnt] = iy;
605     izn[ncnt] = iz;
606     il = ix - lo[0];
607     jl = iy - lo[1];
608     kl = iz - lo[2];
609     idx = kl*ldi*ldj + jl*ldi + il;
610     nghbrs[ncnt] = idx;
611     procid[ncnt] = me;
612     if (ix+1 <= idim - 1) {
613       ncnt++;
614       ixn[ncnt] = ix + 1;
615       iyn[ncnt] = iy;
616       izn[ncnt] = iz;
617       if (ix+1 > hi[0]) {
618         jdx = kp*pdi*pdj + jp*pdi + ip + 1;
619         il = 0;
620         jl = iy - lo[1];
621         kl = iz - lo[2];
622         ldpi = xld[ip+1];
623       } else {
624         jdx = me;
625         il = ix - lo[0] + 1;
626         jl = iy - lo[1];
627         kl = iz - lo[2];
628         ldpi = ldi;
629       }
630       idx = kl*ldpi*ldj + jl*ldpi + il;
631       nghbrs[ncnt] = idx;
632       procid[ncnt] = jdx;
633     }
634     if (ix-1 >= 0) {
635       ncnt++;
636       ixn[ncnt] = ix - 1;
637       iyn[ncnt] = iy;
638       izn[ncnt] = iz;
639       if (ix-1 < lo[0]) {
640         jdx = kp*pdi*pdj + jp*pdi + ip - 1;
641         il = xld[ip-1] - 1;
642         jl = iy - lo[1];
643         kl = iz - lo[2];
644         ldmi = xld[ip-1];
645       } else {
646         jdx = me;
647         il = ix - lo[0] - 1;
648         jl = iy - lo[1];
649         kl = iz - lo[2];
650         ldmi = ldi;
651       }
652       idx = kl*ldmi*ldj + jl*ldmi + il;
653       nghbrs[ncnt] = idx;
654       procid[ncnt] = jdx;
655     }
656     if (iy+1 <= jdim-1) {
657       ncnt++;
658       ixn[ncnt] = ix;
659       iyn[ncnt] = iy + 1;
660       izn[ncnt] = iz;
661       if (iy+1 > hi[1]) {
662         jdx = kp*pdi*pdj + (jp+1)*pdi + ip;
663         il = ix - lo[0];
664         jl = 0;
665         kl = iz - lo[2];
666         ldpj = yld[jp+1];
667       } else {
668         jdx = me;
669         il = ix - lo[0];
670         jl = iy - lo[1] + 1;
671         kl = iz - lo[2];
672         ldpj = ldj;
673       }
674       idx = kl*ldi*ldpj + jl*ldi + il;
675       nghbrs[ncnt] = idx;
676       procid[ncnt] = jdx;
677     }
678     if (iy-1 >= 0) {
679       ncnt++;
680       ixn[ncnt] = ix;
681       iyn[ncnt] = iy - 1;
682       izn[ncnt] = iz;
683       if (iy-1 < lo[1]) {
684         jdx = kp*pdi*pdj + (jp-1)*pdi + ip;
685         il = ix - lo[0];
686         jl = yld[jp-1] - 1;
687         kl = iz - lo[2];
688         ldmj = yld[jp-1];
689       } else {
690         jdx = me;
691         il = ix - lo[0];
692         jl = iy - lo[1] - 1;
693         kl = iz - lo[2];
694         ldmj = ldj;
695       }
696       idx = kl*ldi*ldmj + jl*ldi + il;
697       nghbrs[ncnt] = idx;
698       procid[ncnt] = jdx;
699     }
700     if (iz+1 <= kdim-1) {
701       ncnt++;
702       ixn[ncnt] = ix;
703       iyn[ncnt] = iy;
704       izn[ncnt] = iz + 1;
705       if (iz+1 > hi[2]) {
706         jdx = (kp+1)*pdi*pdj + jp*pdi + ip;
707         il = ix - lo[0];
708         jl = iy - lo[1];
709         kl = 0;
710       } else {
711         jdx = me;
712         il = ix - lo[0];
713         jl = iy - lo[1];
714         kl = iz - lo[2] + 1;
715       }
716       idx = kl*ldi*ldj + jl*ldi + il;
717       nghbrs[ncnt] = idx;
718       procid[ncnt] = jdx;
719     }
720     if (iz-1 >= 0) {
721       ncnt++;
722       ixn[ncnt] = ix;
723       iyn[ncnt] = iy;
724       izn[ncnt] = iz - 1;
725       if (iz-1 < lo[2]) {
726         jdx = (kp-1)*pdi*pdj + jp*pdi + ip;
727         il = ix - lo[0];
728         jl = iy - lo[1];
729         kl = zld[kp-1] - 1;
730       } else {
731         jdx = me;
732         il = ix - lo[0];
733         jl = iy - lo[1];
734         kl = iz - lo[2] - 1;
735       }
736       idx = kl*ldi*ldj + jl*ldi + il;
737       nghbrs[ncnt] = idx;
738       procid[ncnt] = jdx;
739     }
740     /*
741     sort indices so that neighbors run from lowest to highest local index. This sort
742     is not particularly efficient but ncnt is generally small
743      */
744     ncnt++;
745     for (j=0; j<ncnt; j++) {
746       for (k=j+1; k<ncnt; k++) {
747         if (nghbrs[j] > nghbrs[k]) {
748           itmp = nghbrs[j];
749           nghbrs[j] = nghbrs[k];
750           nghbrs[k] = itmp;
751           itmp = ixn[j];
752           ixn[j] = ixn[k];
753           ixn[k] = itmp;
754           itmp = iyn[j];
755           iyn[j] = iyn[k];
756           iyn[k] = itmp;
757           itmp = izn[j];
758           izn[j] = izn[k];
759           izn[k] = itmp;
760           itmp = procid[j];
761           procid[j] = procid[k];
762           procid[k] = itmp;
763         }
764       }
765     }
766     for (k=0; k<ncnt; k++) {
767       if (nghbrs[k] < 0 || nghbrs[k] >= ntot) {
768         printf("p[%d] Invalid neighbor %d\n",me,nghbrs[k]);
769       }
770     }
771 
772 /* set weights corresponding to a finite difference Laplacian on a 7-point
773    stencil */
774 
775     for (j=0; j<ncnt; j++) {
776       jdx = procid[j];
777       idx = lproc_inv[jdx];
778       if (ix == ixn[j] && iy == iyn[j] && iz == izn[j]) {
779         rval[lvoffset[idx]+licnt[idx]] = 6.0;
780       } else {
781         rval[lvoffset[idx]+licnt[idx]] = -1.0;
782       }
783       if (lvoffset[idx]+licnt[idx] < 0 || lvoffset[idx]+licnt[idx] >= nsave) {
784         printf("p[%d] Out of bounds (lvoffset+licnt)[%d]: %d\n",me,idx,lvoffset[idx]+licnt[idx]);
785       }
786       if (lvoffset[idx]+licnt[idx]>=idbg) {
787       }
788       /* TODO: Check this carefully */
789       jval[lvoffset[idx]+licnt[idx]] = nghbrs[j];
790       ivalt[idx*isize+i-imin] = ivalt[idx*isize+i-imin]+1;
791       licnt[idx]++;
792     }
793   }
794 
795 /* finish evaluating ival array */
796 
797   for (i=0; i<ltotal_procs; i++) {
798     ival[i*isize] = lvoffset[i];
799     for (j=1; j<isize; j++) {
800       ival[i*isize+j] = ival[i*isize+j-1] + ivalt[i*isize+j-1];
801     }
802   }
803   isize = 0;
804   for (i=0; i<ltotal_procs; i++) {
805     isize = isize + licnt[i];
806   }
807   if (isize > MAXVEC)
808     NGA_Error("ISIZE exceeds MAXVEC in local arrays ",isize);
809 
810 /* Local portion of sparse matrix has been evaluated and decomposed into blocks
811    that match partitioning of right hand side across processors. The following
812    data is available at this point:
813       1) ltotal_procs: the number of processors that are coupled to this one via
814          the sparse matrix
815       2) lproclist(ltotal_procs): a list of processor IDs that are coupled to
816          this processor
817       3) lproc_inv(nprocs): The entry in proc_list that corresponds to a given
818          processor. If the entry is -1 then that processor does not couple to
819          this processor.
820       4) licnt(ltotal_procs): The number of non-zero entries in the sparse matrix
821          that couple the process represented by proc_list(j) to this process
822       5) lvoffset(ltotal_procs): The offsets for the non-zero data in the arrays
823          rval and jval for the blocks that couple this processor to other
824          processes in proc_list
825       6) offset(nprocs): the offset array for the distributed right hand side
826          vector
827     These arrays describe how the sparse matrix is layed out both locally and
828     across processors. In addition, the actual data for the distributed sparse
829     matrix is found in the following arrays:
830       1) rval: values of matrix for all blocks on this processor
831       2) jval: j-indices of matrix for all blocks on this processor
832       3) ival(ltotal_procs*(lnsize(me)+1)): starting index in rval and
833          jval for each row in each block */
834 
835   NGA_Sync();
836 
837 /* Create a sparse array of sparse blocks.
838    Each block element is divided into for sections.
839    The first section consists of 7 ints and contains the parameters
840      imin: minimum i index represented by block
841      imin: maximum i index represented by block
842      jmin: minimum j index represented by block
843      jmin: maximum j index represented by block
844      iblock: row index of block
845      jblock: column index of block
846      nnz: number of non-zero elements in block
847    The next section consists of nnz doubles that represent the non-zero values
848    in the block. The third section consists of nnz ints and contains the local
849    j indices of all values. The final section consists of (imax-imin+2) ints
850    and contains the starting index in jval and rval for the each row between
851    imin and imax. An extra value is included at the end and is set equal to
852    nnz+1. This is included to simplify some coding.
853  */
854 
855   offset = 0;
856   for (i=0; i<me; i++) {
857     offset += nnz_list[i];
858   }
859   NGA_Put(*g_i, &me, &me, &offset, &one);
860   if (me==nprocs-1) {
861     NGA_Put(*g_i, &nprocs, &nprocs, &nnz, &one);
862   }
863   NGA_Sync();
864   for (i = 0; i<ltotal_procs; i++) {
865     /* evaluate total size of block */
866     b_nnz = ecnt[lproclist[i]];
867     isize = 7*sizeof(int) + b_nnz*(sizeof(double)+sizeof(int))
868           + (imax-imin+2)*sizeof(int);
869     blk_ptr = (int*)GP_Malloc(isize);
870 
871     iparams = blk_ptr;
872     gp_rval = (double*)(iparams+7);
873     gp_jval = (int*)(gp_rval+b_nnz);
874     gp_ival = (gp_jval+b_nnz);
875 
876     iparams[0] = imin;
877     iparams[1] = imax;
878     iparams[2] = jmin[lproclist[i]];
879     iparams[3] = jmax[lproclist[i]];
880     iparams[4] = me;
881     iparams[5] = lproclist[i];
882     iparams[6] = b_nnz;
883 
884     ldj = (imax-imin+2);
885     k = 0;
886     toffset = lvoffset[i];
887     for (j=0; j<b_nnz; j++) {
888       gp_jval[j] = jval[toffset+j];
889       gp_rval[j] = rval[toffset+j];
890     }
891 
892     toffset = ival[i*ldj];
893     for (k=0; k<ldj; k++) {
894       gp_ival[k] = ival[i*ldj+k]-toffset;
895     }
896 
897     /* Assign blk_ptr to GP array element */
898     GP_Assign_local_element(*gp_block, &offset, (void*)blk_ptr, isize);
899     j = 1;
900     NGA_Put(*g_j,&offset,&offset,&lproclist[i],&j);
901     offset++;
902   }
903   NGA_Sync();
904 
905   tmapc = (int*)malloc(nprocs*sizeof(int));
906   tmapc[0] = 0;
907   for (i=1; i<nprocs; i++) {
908     tmapc[i] = tmapc[i-1] + lnsize[i-1];
909   }
910     i = nprocs-1;
911   *imapc = tmapc;
912 
913   free(rval);
914   free(jval);
915   free(ival);
916   free(ivalt);
917   free(xld);
918   free(yld);
919   free(zld);
920   free(lnsize);
921   free(lvoffset);
922   free(ecnt);
923   free(licnt);
924   free(lproclist);
925   free(lproc_inv);
926   free(jmin);
927   free(jmax);
928   free(nnz_list);
929   return;
930 }
931 
main(int argc,char ** argv)932 int main(int argc, char **argv) {
933   int nmax, nprocs, me, me_plus;
934   int g_a_data, g_a_i, g_a_j, isize;
935   int gt_a_data, gt_a_i, gt_a_j;
936   int g_b, g_c;
937   int i, j, jj, k, one, jcnt;
938   int chunk, kp1, ld;
939   int *p_i, *p_j;
940   double *p_data, *p_b, *p_c;
941   double t_beg, t_beg2, t_ga_tot, t_get, t_mult, t_cnstrct, t_mpi_in, t_ga_in;
942   double t_hypre_strct, t_ga_trans, t_gp_get;
943   double t_get_blk_csr, t_trans_blk_csr, t_trans_blk, t_create_csr_ga, t_beg3;
944   double t_gp_tget, t_gp_malloc, t_gp_assign, t_beg4;
945   double prdot, dotga, dothypre, tempc;
946   double prtot, gatot, hypretot, gatot2, hypretot2;
947   double prdot2, prtot2;
948   int status;
949   int idim, jdim, kdim, idum, memsize;
950   int lsize, ntot;
951   int heap=200000, fudge=100, stack=200000, ma_heap;
952   double *cbuf, *vector;
953   int pdi, pdj, pdk, ip, jp, kp, ncells;
954   int lo[3],hi[3];
955   int blo[3], bhi[3];
956   int ld_a, ld_b, ld_c, ld_i, ld_j, irows, ioff, joff, total_procs;
957   int iproc, iblock, btot;
958   double *amat, *bvec;
959   int *ivec, *jvec;
960   int *proclist, *proc_inv, *icnt;
961   int *voffset, *offset, *mapc;
962   int iloop, lo_bl, hi_bl;
963   char *buf, **buf_ptr;
964   int *iparams, *jval, *ival;
965   double *rval, *rvalt;
966   int imin, imax, jmin, jmax, irow, icol, nnz;
967   int nrows, kmin, kmax, lmin, lmax, jdx;
968   int LOOPNUM = 100;
969   void **blk_ptr;
970   void *blk;
971   int blk_size, tsize, zero;
972   int *iblk, *jblk, *blkidx;
973   int *tblk_ptr;
974   int *ivalt, *jvalt, *iparamst;
975   int *iblk_t, *jblk_t, *blkidx_t;
976 /*
977    Hypre declarations
978 */
979   int ierr;
980 #if USE_HYPRE
981   HYPRE_StructGrid grid;
982   HYPRE_StructStencil stencil;
983   HYPRE_StructMatrix matrix;
984   HYPRE_StructVector vec_x, vec_y;
985   int i4, j4, ndim, nelems, offsets[7][3];
986   int stencil_indices[7], hlo[3], hhi[3];
987   double weights[7];
988   double *values;
989   double alpha, beta;
990   int *rows, *cols;
991 #endif
992 /*
993   ***  Intitialize a message passing library
994 */
995   zero = 0;
996   one = 1;
997   ierr = MPI_Init(&argc, &argv);
998 /*
999  ***  Initialize GA
1000 
1001       There are 2 choices: ga_initialize or ga_initialize_ltd.
1002       In the first case, there is no explicit limit on memory usage.
1003       In the second, user can set limit (per processor) in bytes.
1004 */
1005   t_beg = GA_Wtime();
1006   NGA_Initialize();
1007   GP_Initialize();
1008   t_ga_in = GA_Wtime() - t_beg;
1009   NGA_Dgop(&t_ga_in,one,"+");
1010 
1011   t_ga_tot = 0.0;
1012   t_ga_trans = 0.0;
1013   t_get_blk_csr = 0.0;
1014   t_create_csr_ga = 0.0;
1015   t_trans_blk_csr = 0.0;
1016   t_trans_blk = 0.0;
1017   t_gp_get = 0.0;
1018   t_gp_malloc = 0.0;
1019   t_gp_assign = 0.0;
1020   t_mult = 0.0;
1021   t_get = 0.0;
1022   t_gp_tget = 0.0;
1023   t_hypre_strct = 0.0;
1024   prtot = 0.0;
1025   prtot2 = 0.0;
1026   gatot = 0.0;
1027   hypretot = 0.0;
1028 
1029   me = NGA_Nodeid();
1030   me_plus = me + 1;
1031   nprocs = NGA_Nnodes();
1032   if (me == 0) {
1033    printf("Time to initialize GA:                                 %12.4f\n",
1034           t_ga_in/((double)nprocs));
1035   }
1036 /*
1037      we can also use GA_set_memory_limit BEFORE first ga_create call
1038 */
1039   ma_heap = heap + fudge;
1040 /*      call GA_set_memory_limit(util_mdtob(ma_heap)) */
1041 
1042   if (me == 0) {
1043     printf("\nNumber of cores used: %d\n\nGA initialized\n\n",nprocs);
1044   }
1045 /*
1046  ***  Initialize the MA package
1047       MA must be initialized before any global array is allocated
1048 */
1049   if (!MA_init(MT_DBL, stack, ma_heap)) NGA_Error("ma_init failed",-1);
1050 /*
1051      create a sparse LMAX x LMAX matrix and two vectors of length
1052      LMAX. The matrix is stored in compressed row format.
1053      One of the vectors is filled with random data and the other
1054      is filled with zeros.
1055 */
1056   idim = IMAX;
1057   jdim = JMAX;
1058   kdim = KMAX;
1059   ntot = idim*jdim*kdim;
1060   if (me == 0) {
1061     printf("\nDimension of matrix: %d\n\n",ntot);
1062   }
1063   t_beg = GA_Wtime();
1064   grid_factor(nprocs,idim,jdim,kdim,&pdi,&pdj,&pdk);
1065   if (me == 0) {
1066     printf("\nProcessor grid configuration\n");
1067     printf("  PDX: %d\n",pdi);
1068     printf("  PDY: %d\n",pdj);
1069     printf("  PDZ: %d\n\n",pdk);
1070     printf(" Number of Loops: %d\n",LOOPNUM);
1071   }
1072 
1073   create_laplace_mat(idim,jdim,kdim,pdi,pdj,pdk,&g_a_data,&g_a_j,&g_a_i,&mapc);
1074   t_cnstrct = GA_Wtime() - t_beg;
1075 
1076   g_b = NGA_Create_handle();
1077   NGA_Set_data(g_b,one,&ntot,MT_DBL);
1078   NGA_Set_irreg_distr(g_b,mapc,&nprocs);
1079   status = NGA_Allocate(g_b);
1080 /*
1081     fill g_b with random values
1082 */
1083   NGA_Distribution(g_b,me,blo,bhi);
1084   NGA_Access(g_b,blo,bhi,&p_b,&ld);
1085   ld = bhi[0]-blo[0]+1;
1086   btot = ld;
1087   vector = (double*)malloc(ld*sizeof(double));
1088   for (i=0; i<ld; i++) {
1089     idum  = 0;
1090     p_b[i] = ran3(&idum);
1091     vector[i] = p_b[i];
1092   }
1093   NGA_Release(g_b,blo,bhi);
1094   NGA_Sync();
1095 
1096   g_c = NGA_Create_handle();
1097   NGA_Set_data(g_c,one,&ntot,MT_DBL);
1098   NGA_Set_irreg_distr(g_c,mapc,&nprocs);
1099   status = NGA_Allocate(g_c);
1100   NGA_Zero(g_c);
1101 #if USE_HYPRE
1102 /*
1103     Assemble HYPRE grid and use that to create matrix. Start by creating
1104     grid partition
1105 */
1106   ndim = 3;
1107   i = me;
1108   ip = i%pdi;
1109   i = (i-ip)/pdi;
1110   jp = i%pdj;
1111   kp = (i-jp)/pdj;
1112   lo[0] = (int)(((double)idim)*((double)ip)/((double)pdi));
1113   if (ip < pdi-1) {
1114     hi[0] = (int)(((double)idim)*((double)(ip+1))/((double)pdi)) - 1;
1115   } else {
1116     hi[0] = idim - 1;
1117   }
1118   lo[1] = (int)(((double)jdim)*((double)jp)/((double)pdj));
1119   if (jp < pdj-1) {
1120     hi[1] = (int)(((double)jdim)*((double)(jp+1))/((double)pdj)) - 1;
1121   } else {
1122     hi[1] = jdim - 1;
1123   }
1124   lo[2] = (int)(((double)kdim)*((double)kp)/((double)pdk));
1125   if (kp < pdk-1) {
1126     hi[2] = (int)(((double)kdim)*((double)(kp+1))/((double)pdk)) - 1;
1127   } else {
1128     hi[2] = kdim - 1;
1129   }
1130 /*
1131    Create grid
1132 */
1133   hlo[0] = lo[0];
1134   hlo[1] = lo[1];
1135   hlo[2] = lo[2];
1136   hhi[0] = hi[0];
1137   hhi[1] = hi[1];
1138   hhi[2] = hi[2];
1139   ierr = HYPRE_StructGridCreate(MPI_COMM_WORLD, ndim, &grid);
1140   ierr = HYPRE_StructGridSetExtents(grid, hlo, hhi);
1141   ierr = HYPRE_StructGridAssemble(grid);
1142 /*
1143    Create stencil
1144 */
1145   offsets[0][0] = 0;
1146   offsets[0][1] = 0;
1147   offsets[0][2] = 0;
1148 
1149   offsets[1][0] = 1;
1150   offsets[1][1] = 0;
1151   offsets[1][2] = 0;
1152 
1153   offsets[2][0] = 0;
1154   offsets[2][1] = 1;
1155   offsets[2][2] = 0;
1156 
1157   offsets[3][0] = 0;
1158   offsets[3][1] = 0;
1159   offsets[3][2] = 1;
1160 
1161   offsets[4][0] = -1;
1162   offsets[4][1] = 0;
1163   offsets[4][2] = 0;
1164 
1165   offsets[5][0] = 0;
1166   offsets[5][1] = -1;
1167   offsets[5][2] = 0;
1168 
1169   offsets[6][0] = 0;
1170   offsets[6][1] = 0;
1171   offsets[6][2] = -1;
1172 
1173   nelems = 7;
1174   ierr = HYPRE_StructStencilCreate(ndim, nelems, &stencil);
1175   for (i=0; i<nelems; i++) {
1176     ierr = HYPRE_StructStencilSetElement(stencil, i, offsets[i]);
1177   }
1178 
1179   ncells = (hi[0]-lo[0]+1)*(hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
1180   jcnt = 7*ncells;
1181   values = (double*)malloc(jcnt*sizeof(double));
1182   jcnt = 0;
1183   weights[0] = 6.0;
1184   weights[1] = -1.0;
1185   weights[2] = -1.0;
1186   weights[3] = -1.0;
1187   weights[4] = -1.0;
1188   weights[5] = -1.0;
1189   weights[6] = -1.0;
1190   for (i=0; i<ncells; i++) {
1191     for (j=0; j<7; j++) {
1192       values[jcnt] = weights[j];
1193       jcnt++;
1194     }
1195   }
1196 
1197   ierr = HYPRE_StructMatrixCreate(MPI_COMM_WORLD, grid, stencil, &matrix);
1198   ierr = HYPRE_StructMatrixInitialize(matrix);
1199   for (i=0; i<7; i++) {
1200     stencil_indices[i] = i;
1201   }
1202   ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, 7, stencil_indices, values);
1203   free(values);
1204 /*
1205    Check all six sides of current box to see if any are boundaries.
1206    Set values to zero if they are.
1207 */
1208   if (hi[0] == idim-1) {
1209     ncells = (hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
1210     hlo[0] = idim-1;
1211     hhi[0] = idim-1;
1212     hlo[1] = lo[1];
1213     hhi[1] = hi[1];
1214     hlo[2] = lo[2];
1215     hhi[2] = hi[2];
1216     values = (double*)malloc(ncells*sizeof(double));
1217     for (i=0; i<ncells; i++) values[i] = 0.0;
1218     i4 = 1;
1219     j4 = 1;
1220     ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
1221     free(values);
1222   }
1223   if (hi[1] == jdim-1) {
1224     ncells = (hi[0]-lo[0]+1)*(hi[2]-lo[2]+1);
1225     hlo[0] = lo[0];
1226     hhi[0] = hi[0];
1227     hlo[1] = jdim-1;
1228     hhi[1] = jdim-1;
1229     hlo[2] = lo[2];
1230     hhi[2] = hi[2];
1231     values = (double*)malloc(ncells*sizeof(double));
1232     for (i=0; i<ncells; i++) values[i] = 0.0;
1233     i4 = 1;
1234     j4 = 2;
1235     ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
1236     free(values);
1237   }
1238   if (hi[2] == kdim-1) {
1239     ncells = (hi[0]-lo[0]+1)*(hi[1]-lo[1]+1);
1240     hlo[0] = lo[0];
1241     hhi[0] = hi[0];
1242     hlo[1] = lo[1];
1243     hhi[1] = hi[1];
1244     hlo[2] = kdim-1;
1245     hhi[2] = kdim-1;
1246     values = (double*)malloc(ncells*sizeof(double));
1247     for (i=0; i<ncells; i++) values[i] = 0.0;
1248     i4 = 1;
1249     j4 = 3;
1250     ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
1251     free(values);
1252   }
1253   if (lo[0] == 0) {
1254     ncells = (hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
1255     hlo[0] = 0;
1256     hhi[0] = 0;
1257     hlo[1] = lo[1];
1258     hhi[1] = hi[1];
1259     hlo[2] = lo[2];
1260     hhi[2] = hi[2];
1261     values = (double*)malloc(ncells*sizeof(double));
1262     for (i=0; i<ncells; i++) values[i] = 0.0;
1263     i4 = 1;
1264     j4 = 4;
1265     ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
1266     free(values);
1267   }
1268   if (lo[1] == 0) {
1269     ncells = (hi[0]-lo[0]+1)*(hi[2]-lo[2]+1);
1270     hlo[0] = lo[0];
1271     hhi[0] = hi[0];
1272     hlo[1] = 0;
1273     hhi[1] = 0;
1274     hlo[2] = lo[2];
1275     hhi[2] = hi[2];
1276     values = (double*)malloc(ncells*sizeof(double));
1277     for (i=0; i<ncells; i++) values[i] = 0.0;
1278     i4 = 1;
1279     j4 = 5;
1280     ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
1281     free(values);
1282   }
1283   if (lo[2] == 1) {
1284     ncells = (hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
1285     hlo[0] = lo[0];
1286     hhi[0] = hi[0];
1287     hlo[1] = lo[1];
1288     hhi[1] = hi[1];
1289     hlo[2] = 0;
1290     hhi[2] = 0;
1291     values = (double*)malloc(ncells*sizeof(double));
1292     for (i=0; i<ncells; i++) values[i] = 0.0;
1293     i4 = 1;
1294     j4 = 6;
1295     ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
1296     free(values);
1297   }
1298   ierr = HYPRE_StructMatrixAssemble(matrix);
1299 /*
1300     Create vectors for matrix-vector multiply
1301 */
1302   ierr = HYPRE_StructVectorCreate(MPI_COMM_WORLD, grid, &vec_x);
1303   ierr = HYPRE_StructVectorInitialize(vec_x);
1304   hlo[0] = lo[0];
1305   hlo[1] = lo[1];
1306   hlo[2] = lo[2];
1307   hhi[0] = hi[0];
1308   hhi[1] = hi[1];
1309   hhi[2] = hi[2];
1310   ierr = HYPRE_StructVectorSetBoxValues(vec_x, hlo, hhi, vector);
1311   ierr = HYPRE_StructVectorAssemble(vec_x);
1312   NGA_Distribution(g_a_i,me,blo,bhi);
1313 
1314   if (bhi[1] > ntot-1) {
1315     bhi[1] = ntot-1;
1316   }
1317 
1318   btot = (hi[0]-lo[0]+1)*(hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
1319 
1320   for (i=0; i<btot; i++) vector[i] = 0.0;
1321   hlo[0] = lo[0];
1322   hlo[1] = lo[1];
1323   hlo[2] = lo[2];
1324   hhi[0] = hi[0];
1325   hhi[1] = hi[1];
1326   hhi[2] = hi[2];
1327   ierr = HYPRE_StructVectorGetBoxValues(vec_x, hlo, hhi, vector);
1328 
1329   for (i=0; i<btot; i++) vector[i] = 0.0;
1330   ierr = HYPRE_StructVectorCreate(MPI_COMM_WORLD, grid, &vec_y);
1331   ierr = HYPRE_StructVectorInitialize(vec_y);
1332   ierr = HYPRE_StructVectorSetBoxValues(vec_y, hlo, hhi, vector);
1333   ierr = HYPRE_StructVectorAssemble(vec_y);
1334 #endif
1335 /* Multiply sparse matrix. Start by accessing pointers to local portions of
1336    g_a_data, g_a_j, g_a_i */
1337 
1338   NGA_Sync();
1339   for (iloop=0; iloop<LOOPNUM; iloop++) {
1340     t_beg2 = GA_Wtime();
1341 
1342     NGA_Distribution(g_c,me,blo,bhi);
1343     NGA_Access(g_c,blo,bhi,&p_c,&ld_c);
1344     for (i = 0; i<bhi[0]-blo[0]+1; i++) {
1345       p_c[i] = 0.0;
1346     }
1347 
1348 /* get number of matrix blocks coupled to this process */
1349     NGA_Get(g_a_i,&me,&me,&lo_bl,&one);
1350 #if 1
1351     NGA_Get(g_a_i,&me_plus,&me_plus,&hi_bl,&one);
1352     hi_bl--;
1353     total_procs = hi_bl - lo_bl + 1;
1354     blk_ptr = (void**)malloc(sizeof(void*));
1355 /* Loop through matrix blocks */
1356     ioff = 0;
1357     for (iblock = 0; iblock<total_procs; iblock++) {
1358       t_beg = GA_Wtime();
1359       jdx = lo_bl+iblock;
1360 #if 0
1361       GP_Access_element(g_a_data, &jdx, &blk_ptr[0], &isize);
1362 #endif
1363 #if 1
1364       GP_Get_size(g_a_data, &jdx, &jdx, &isize);
1365 #endif
1366       blk = (void*)malloc(isize);
1367 #if 1
1368       GP_Get(g_a_data, &jdx, &jdx, blk, blk_ptr, &one, &blk_size, &one, &tsize, 0);
1369 #endif
1370       t_gp_get = t_gp_get + GA_Wtime() - t_beg;
1371       iparams = (int*)blk_ptr[0];
1372       rval = (double*)(iparams+7);
1373       imin = iparams[0];
1374       imax = iparams[1];
1375       jmin = iparams[2];
1376       jmax = iparams[3];
1377       irow = iparams[4];
1378       icol = iparams[5];
1379       nnz = iparams[6];
1380       jval = (int*)(rval+nnz);
1381       ival = (int*)(jval+nnz);
1382       nrows = imax - imin + 1;
1383       bvec = (double*)malloc((jmax-jmin+1)*sizeof(double));
1384       j = 0;
1385       t_beg = GA_Wtime();
1386       NGA_Get(g_b,&jmin,&jmax,bvec,&j);
1387       t_get = t_get + GA_Wtime() - t_beg;
1388       t_beg = GA_Wtime();
1389       for (i=0; i<nrows; i++) {
1390         kmin = ival[i];
1391         kmax = ival[i+1]-1;
1392         tempc = 0.0;
1393         for (j = kmin; j<=kmax; j++) {
1394           jj = jval[j];
1395           tempc = tempc + rval[j]*bvec[jj];
1396         }
1397         p_c[i] = p_c[i] + tempc;
1398       }
1399       t_mult = t_mult + GA_Wtime() - t_beg;
1400       free(bvec);
1401       free(blk);
1402     }
1403     NGA_Sync();
1404     t_ga_tot = t_ga_tot + GA_Wtime() - t_beg2;
1405 
1406     NGA_Distribution(g_c,me,blo,bhi);
1407     NGA_Release(g_c,blo,bhi);
1408 
1409 #if USE_HYPRE
1410     alpha = 1.0;
1411     beta = 0.0;
1412     t_beg = GA_Wtime();
1413     ierr = HYPRE_StructMatrixMatvec(alpha, matrix, vec_x, beta, vec_y);
1414     t_hypre_strct = t_hypre_strct + GA_Wtime() - t_beg;
1415     hlo[0] = lo[0];
1416     hlo[1] = lo[1];
1417     hlo[2] = lo[2];
1418     hhi[0] = hi[0];
1419     hhi[1] = hi[1];
1420     hhi[2] = hi[2];
1421     ierr = HYPRE_StructVectorGetBoxValues(vec_y, hlo, hhi, vector);
1422     NGA_Distribution(g_c,me,hlo,hhi);
1423     cbuf = (double*)malloc((hhi[0]-hlo[0]+1)*sizeof(double));
1424     NGA_Get(g_c,hlo,hhi,cbuf,&one);
1425     prdot = 0.0;
1426     dotga = 0.0;
1427     dothypre = 0.0;
1428     for (i=0; i<(hhi[0]-hlo[0]+1); i++) {
1429       dothypre = dothypre + vector[i]*vector[i];
1430       dotga = dotga + cbuf[i]*cbuf[i];
1431       prdot = prdot + (vector[i]-cbuf[i])*(vector[i]-cbuf[i]);
1432     }
1433     NGA_Dgop(&dotga,1,"+");
1434     NGA_Dgop(&dothypre,1,"+");
1435     NGA_Dgop(&prdot,1,"+");
1436     gatot += sqrt(dotga);
1437     hypretot += sqrt(dothypre);
1438     prtot += sqrt(prdot);
1439     free(cbuf);
1440 #endif
1441 
1442 /* Transpose matrix. Start by making local copies of ival and jval arrays for
1443    the sparse matrix of blocks stored in the GP array */
1444 #if 1
1445     t_beg2 = GA_Wtime();
1446     t_beg3 = GA_Wtime();
1447     iblk = (int*)malloc((nprocs+1)*sizeof(int));
1448     iblk_t = (int*)malloc((nprocs+1)*sizeof(int));
1449 #if 0
1450     NGA_Get(g_a_i,&zero,&nprocs,iblk,&one);
1451 #else
1452     if (me == 0) {
1453       NGA_Get(g_a_i,&zero,&nprocs,iblk,&one);
1454     } else {
1455       for (i=0; i<nprocs+1; i++) {
1456         iblk[i] = 0;
1457       }
1458     }
1459     GA_Igop(iblk,nprocs+1,"+");
1460 #endif
1461     jblk = (int*)malloc(iblk[nprocs]*sizeof(int));
1462     jblk_t = (int*)malloc(iblk[nprocs]*sizeof(int));
1463     iblock = iblk[nprocs]-1;
1464 #if 0
1465     NGA_Get(g_a_j,&zero,&iblock,jblk,&one);
1466 #else
1467     if (me == 0) {
1468       NGA_Get(g_a_j,&zero,&iblock,jblk,&one);
1469     } else {
1470       for (i=0; i<iblock+1; i++) {
1471         jblk[i] = 0;
1472       }
1473     }
1474     GA_Igop(jblk,iblock+1,"+");
1475 #endif
1476     iblock++;
1477     blkidx = (int*)malloc(iblk[nprocs]*sizeof(int));
1478     blkidx_t = (int*)malloc(iblk[nprocs]*sizeof(int));
1479     for (i=0; i<iblock; i++) {
1480       blkidx[i] = i;
1481     }
1482     iblock = nprocs;
1483     t_get_blk_csr = t_get_blk_csr + GA_Wtime() - t_beg3;
1484     t_beg3 = GA_Wtime();
1485     stran(iblock, iblock, iblk, jblk, blkidx, iblk_t, jblk_t, blkidx_t);
1486     t_trans_blk_csr = t_trans_blk_csr + GA_Wtime() - t_beg3;
1487     t_beg3 = GA_Wtime();
1488     gt_a_data = GP_Create_handle();
1489     i = iblk_t[nprocs];
1490     GP_Set_dimensions(gt_a_data, one, &i);
1491     GP_Set_irreg_distr(gt_a_data, iblk_t, &nprocs);
1492     GP_Allocate(gt_a_data);
1493 
1494     gt_a_j = NGA_Create_handle();
1495     i = iblk_t[nprocs];
1496     NGA_Set_data(gt_a_j, one, &i, C_INT);
1497     NGA_Set_irreg_distr(gt_a_j, iblk_t, &nprocs);
1498     NGA_Allocate(gt_a_j);
1499 
1500     gt_a_i = NGA_Create_handle();
1501     i = nprocs+1;
1502     NGA_Set_data(gt_a_i,one,&i,C_INT);
1503     for (i=0; i<nprocs; i++) mapc[i] = i;
1504     NGA_Set_irreg_distr(gt_a_i, mapc, &nprocs);
1505     NGA_Allocate(gt_a_i);
1506 
1507     /* copy i and j arrays of transposed matrix into distributed arrays */
1508     if (me==0) {
1509       lo_bl = 0;
1510       hi_bl = nprocs;
1511       NGA_Put(gt_a_i,&lo_bl,&hi_bl,iblk_t,&one);
1512       lo_bl = 0;
1513       hi_bl = iblk_t[nprocs]-1;
1514       NGA_Put(gt_a_j,&lo_bl,&hi_bl,jblk_t,&one);
1515     }
1516     NGA_Sync();
1517     lo_bl = iblk[me];
1518     hi_bl = iblk[me+1];
1519     total_procs = hi_bl - lo_bl + 1;
1520     total_procs = hi_bl - lo_bl;
1521     t_create_csr_ga = t_create_csr_ga + GA_Wtime() - t_beg3;
1522     for (iblock = lo_bl; iblock < hi_bl; iblock++) {
1523       t_beg4 = GA_Wtime();
1524       jdx = blkidx_t[iblock];
1525       GP_Get_size(g_a_data, &jdx, &jdx, &isize);
1526       blk = (void*)malloc(isize);
1527       GP_Get(g_a_data, &jdx, &jdx, blk, blk_ptr, &one, &blk_size, &one, &tsize, 0);
1528       /* Parameters for original block */
1529       iparams = (int*)blk_ptr[0];
1530       rval = (double*)(iparams+7);
1531       imin = iparams[0];
1532       imax = iparams[1];
1533       jmin = iparams[2];
1534       jmax = iparams[3];
1535       irow = iparams[4];
1536       icol = iparams[5];
1537       nnz = iparams[6];
1538       jval = (int*)(rval+nnz);
1539       ival = (int*)(jval+nnz);
1540 
1541       /* Create transposed block */
1542       isize = 7*sizeof(int) + nnz*(sizeof(double)+sizeof(int))
1543             + (jmax-jmin+2)*sizeof(int);
1544       t_gp_tget = t_gp_tget + GA_Wtime() - t_beg4;
1545       t_beg4 = GA_Wtime();
1546       tblk_ptr = (int*)GP_Malloc(isize);
1547       t_gp_malloc = t_gp_malloc + GA_Wtime() - t_beg4;
1548       t_beg3 = GA_Wtime();
1549       iparamst = (int*)tblk_ptr;
1550       rvalt = (double*)(iparamst+7);
1551       jvalt = (int*)(rvalt+nnz);
1552       ivalt = (int*)(jvalt+nnz);
1553       iparamst[0] = jmin;
1554       iparamst[1] = jmax;
1555       iparamst[2] = imin;
1556       iparamst[3] = imax;
1557       iparamst[4] = icol;
1558       iparamst[5] = irow;
1559       iparamst[6] = nnz;
1560       i = imax-imin+1;
1561       j = jmax-jmin+1;
1562       stranr(i, j, ival, jval, rval, ivalt, jvalt, rvalt);
1563       t_trans_blk = t_trans_blk + GA_Wtime() - t_beg3;
1564       t_beg4 = GA_Wtime();
1565       GP_Assign_local_element(gt_a_data, &iblock, (void*)tblk_ptr, isize);
1566       t_gp_assign = t_gp_assign + GA_Wtime() - t_beg4;
1567 #if 1
1568       free(blk);
1569 #endif
1570     }
1571 
1572     /* Clean up after transpose */
1573 #if 1
1574     free(iblk);
1575     free(iblk_t);
1576     free(jblk);
1577     free(jblk_t);
1578     free(blkidx);
1579     free(blkidx_t);
1580 #endif
1581     NGA_Sync();
1582     t_ga_trans = t_ga_trans + GA_Wtime() - t_beg2;
1583 #if USE_HYPRE
1584     alpha = 1.0;
1585     beta = 0.0;
1586     ierr = HYPRE_StructMatrixMatvec(alpha, matrix, vec_x, beta, vec_y);
1587     hlo[0] = lo[0];
1588     hlo[1] = lo[1];
1589     hlo[2] = lo[2];
1590     hhi[0] = hi[0];
1591     hhi[1] = hi[1];
1592     hhi[2] = hi[2];
1593     ierr = HYPRE_StructVectorGetBoxValues(vec_y, hlo, hhi, vector);
1594     NGA_Distribution(g_c,me,hlo,hhi);
1595     cbuf = (double*)malloc((hhi[0]-hlo[0]+1)*sizeof(double));
1596     NGA_Get(g_c,hlo,hhi,cbuf,&one);
1597     dothypre = 0.0;
1598     dotga = 0.0;
1599     prdot2 = 0.0;
1600     for (i=0; i<(hhi[0]-hlo[0]+1); i++) {
1601       dothypre = dothypre + vector[i]*vector[i];
1602       dotga = dotga + cbuf[i]*cbuf[i];
1603       if (fabs(vector[i]-cbuf[i]) > 1.0e-10) {
1604         printf("p[%d] i: %d vector: %f cbuf: %f\n",me,i,vector[i],cbuf[i]);
1605       }
1606       prdot2 = prdot2 + (vector[i]-cbuf[i])*(vector[i]-cbuf[i]);
1607     }
1608     NGA_Dgop(&dotga,1,"+");
1609     NGA_Dgop(&dothypre,1,"+");
1610     NGA_Dgop(&prdot2,1,"+");
1611     prtot2 += sqrt(prdot2);
1612     gatot2 += sqrt(dotga);
1613     hypretot2 += sqrt(dothypre);
1614     free(cbuf);
1615     free(blk_ptr);
1616 #endif
1617     /* Clean up transposed matrix */
1618     GP_Distribution(gt_a_data,me,blo,bhi);
1619     for (i=blo[0]; i<bhi[0]; i++) {
1620       GP_Free(GP_Free_local_element(gt_a_data,&i));
1621     }
1622     GP_Destroy(gt_a_data);
1623     NGA_Destroy(gt_a_i);
1624     NGA_Destroy(gt_a_j);
1625 #endif
1626 #endif
1627   }
1628   free(vector);
1629 #if USE_HYPRE
1630   if (me == 0) {
1631     printf("Magnitude of GA solution:                         %e\n",
1632         gatot/((double)LOOPNUM));
1633     printf("Magnitude of HYPRE solution:                      %e\n",
1634         hypretot/((double)LOOPNUM));
1635     printf("Magnitude of GA solution(2):                      %e\n",
1636         gatot2/((double)LOOPNUM));
1637     printf("Magnitude of HYPRE solution(2):                   %e\n",
1638         hypretot2/((double)LOOPNUM));
1639     printf("Difference between GA and HYPRE (Struct) results: %e\n",
1640         prtot/((double)LOOPNUM));
1641     printf("Difference between transpose and HYPRE results:   %e\n",
1642         prtot2/((double)LOOPNUM));
1643   }
1644 #endif
1645 
1646 /*
1647    Clean up arrays
1648 */
1649   NGA_Destroy(g_b);
1650   NGA_Destroy(g_c);
1651   GP_Distribution(g_a_data,me,blo,bhi);
1652   for (i=blo[0]; i<bhi[0]; i++) {
1653     GP_Free(GP_Free_local_element(g_a_data,&i));
1654   }
1655   GP_Destroy(g_a_data);
1656   NGA_Destroy(g_a_i);
1657   NGA_Destroy(g_a_j);
1658 #if USE_HYPRE
1659   ierr = HYPRE_StructStencilDestroy(stencil);
1660   ierr = HYPRE_StructGridDestroy(grid);
1661   ierr = HYPRE_StructMatrixDestroy(matrix);
1662   ierr = HYPRE_StructVectorDestroy(vec_x);
1663   ierr = HYPRE_StructVectorDestroy(vec_y);
1664 #endif
1665 
1666   NGA_Dgop(&t_cnstrct,1,"+");
1667   NGA_Dgop(&t_get,1,"+");
1668   NGA_Dgop(&t_gp_get,1,"+");
1669   NGA_Dgop(&t_mult,1,"+");
1670   NGA_Dgop(&t_ga_tot,1,"+");
1671   NGA_Dgop(&t_ga_trans,1,"+");
1672   NGA_Dgop(&t_get_blk_csr,1,"+");
1673   NGA_Dgop(&t_trans_blk_csr,1,"+");
1674   NGA_Dgop(&t_trans_blk,1,"+");
1675   NGA_Dgop(&t_create_csr_ga,1,"+");
1676   NGA_Dgop(&t_gp_tget,1,"+");
1677   NGA_Dgop(&t_gp_malloc,1,"+");
1678   NGA_Dgop(&t_gp_assign,1,"+");
1679 #if USE_HYPRE
1680   NGA_Dgop(&t_hypre_strct,1,"+");
1681 #endif
1682   free(mapc);
1683 
1684   if (me == 0) {
1685     printf("Time to create sparse matrix:                         %12.4f\n",
1686       t_cnstrct/((double)(nprocs*LOOPNUM)));
1687     printf("Time to get right hand side vector:                   %12.4f\n",
1688       t_get/((double)(nprocs*LOOPNUM)));
1689     printf("Time to get GP blocks:                                %12.4f\n",
1690       t_gp_get/((double)(nprocs*LOOPNUM)));
1691     printf("Time for sparse matrix block multiplication:          %12.4f\n",
1692       t_mult/((double)(nprocs*LOOPNUM)));
1693     printf("Time for total sparse matrix multiplication:          %12.4f\n",
1694       t_ga_tot/((double)(nprocs*LOOPNUM)));
1695 #if USE_HYPRE
1696     printf("Total time for HYPRE (Struct)  matrix-vector multiply:%12.4f\n",
1697       t_hypre_strct/((double)(nprocs*LOOPNUM)));
1698 #endif
1699     printf("Time to get block CSR distribution:                   %12.4f\n",
1700       t_get_blk_csr/((double)(nprocs*LOOPNUM)));
1701     printf("Time for transposing block CSR distribution:          %12.4f\n",
1702       t_trans_blk_csr/((double)(nprocs*LOOPNUM)));
1703     printf("Time for creating transposed block CSR GA:            %12.4f\n",
1704       t_create_csr_ga/((double)(nprocs*LOOPNUM)));
1705     printf("Time for transposing blocks:                          %12.4f\n",
1706       t_trans_blk/((double)(nprocs*LOOPNUM)));
1707     printf("Time to get GP blocks for transpose:                  %12.4f\n",
1708       t_gp_tget/((double)(nprocs*LOOPNUM)));
1709     printf("Time to malloc GP blocks for transpose:               %12.4f\n",
1710       t_gp_malloc/((double)(nprocs*LOOPNUM)));
1711     printf("Time to assign GP blocks for transpose:               %12.4f\n",
1712       t_gp_assign/((double)(nprocs*LOOPNUM)));
1713     printf("Time for total sparse matrix transpose:               %12.4f\n",
1714       t_ga_trans/((double)(nprocs*LOOPNUM)));
1715   }
1716   if (me==0) {
1717     printf("Terminating GA library\n");
1718   }
1719   NGA_Terminate();
1720 /*
1721  ***  Tidy up after message-passing library
1722  */
1723   ierr = MPI_Finalize();
1724 }
1725