#include #include #include #include "macdecls.h" #include "ga.h" #include "mp3.h" #define USE_HYPRE 0 #define IMAX 200 #define JMAX 200 #define KMAX 200 #define LMAX IMAX*JMAX*KMAX #define bb_a(ib) bb_v(bb_i + (ib)) #define cc_a(ib) cc_v(cc_i + (ib)) #define Integer int #define MAX_FACTOR 10000 /** * If this test is built standalone and then linked to the Hypre library, * the Hypre sparse matrix-vector multiply routines can be used to check the * correctness of the answers */ #if USE_HYPRE #include "HYPRE.h" #include "HYPRE_struct_mv.h" #include "mpi.h" #endif void grid_factor(int p, int xdim, int ydim, int zdim, int *idx, int *idy, int *idz) { int i, j; int ip, ifac, pmax, prime[MAX_FACTOR]; int fac[MAX_FACTOR]; int ix, iy, iz, ichk; i = 1; /** * factor p completely * first, find all prime numbers, besides 1, less than or equal to * the square root of p */ ip = (int)(sqrt((double)p))+1; pmax = 0; for (i=2; i<=ip; i++) { ichk = 1; for (j=0; j MAX_FACTOR) printf("Overflow in grid_factor\n"); prime[pmax-1] = i; } } /** * find all prime factors of p */ ip = p; ifac = 0; for (i=0; i= 0; i--) { ix = xdim/(*idx); iy = ydim/(*idy); iz = zdim/(*idz); if (ix >= iy && ix >= iz && ix > 1) { *idx = fac[i]*(*idx); } else if (iy >= ix && iy >= iz && iy > 1) { *idy = fac[i]*(*idy); } else if (iz >= ix && iz >= iy && iz > 1) { *idz = fac[i]*(*idz); } else { printf("Too many processors in grid factoring routine\n"); } } } /** * Short subroutine for multiplying sparse matrix block with vector segment */ void loc_matmul(double *a_mat, int *jvec, int *ivec, double *bvec, double *cvec, int nrows) { double tempc; int i, j, jj, jmin,jmax; for (i=0; i hi[0]) { jdx = kp*pdi*pdj + jp*pdi + ip + 1; ecnt[jdx] = ecnt[jdx] + 1; } else { ecnt[me] = ecnt[me] + 1; } } if (ix-1 >= 0) { if (ix-1 < lo[0]) { jdx = kp*pdi*pdj + jp*pdi + ip - 1; ecnt[jdx] = ecnt[jdx] + 1; } else { ecnt[me] = ecnt[me] + 1; } } if (iy+1 <= jdim-1) { if (iy+1 > hi[1]) { jdx = kp*pdi*pdj + (jp+1)*pdi + ip; ecnt[jdx] = ecnt[jdx] + 1; } else { ecnt[me] = ecnt[me] + 1; } } if (iy-1 >= 0) { if (iy-1 < lo[1]) { jdx = kp*pdi*pdj + (jp-1)*pdi + ip; ecnt[jdx] = ecnt[jdx] + 1; } else { ecnt[me] = ecnt[me] + 1; } } if (iz+1 <= kdim-1) { if (iz+1 > hi[2]) { jdx = (kp+1)*pdi*pdj + jp*pdi + ip; ecnt[jdx] = ecnt[jdx] + 1; } else { ecnt[me] = ecnt[me] + 1; } } if (iz-1 >= 0) { if (iz-1 < lo[2]) { jdx = (kp-1)*pdi*pdj + jp*pdi + ip; ecnt[jdx] = ecnt[jdx] + 1; } else { ecnt[me] = ecnt[me] + 1; } } } /** * Create a list of processors that this processor is coupled to. Also count * the total number of grid points that this processor couples to. This number * is equal to the total number of matrix elements held by this processor. */ ltotal_procs = 0; ncnt = 0; for (i=0; i 0) { ltotal_procs += 1; ncnt += ecnt[i]; } } *total_procs = ltotal_procs; lproclist = (int*)malloc(ltotal_procs*sizeof(int)); *proclist = lproclist; lproc_inv = (int*)malloc(nprocs*sizeof(int)); *proc_inv = lproc_inv; licnt = (int*)malloc(ltotal_procs*sizeof(int)); *icnt = licnt; /** * Set up conventional CSR arrays to hold portion of matrix owned by this * processor */ rval = (double*)malloc(ncnt*sizeof(double)); idbg = ncnt; jval = (int*)malloc(ncnt*sizeof(int)); ival = (int*)malloc((imax-imin+2)*ltotal_procs*sizeof(int)); ivalt = (int*)malloc((imax-imin+2)*ltotal_procs*sizeof(int)); for (i=0; i 0) { lproclist[j] = i; if (j > 0) { lvoffset[j] = ecnt[lproclist[j-1]]+lvoffset[j-1]; } lproc_inv[i] = j; j++; } } free(ecnt); isize = imax-imin+2; for (i=0; i hi[0]) { jdx = kp*pdi*pdj + jp*pdi + ip + 1; il = 0; jl = iy - lo[1]; kl = iz - lo[2]; ldpi = xld[ip+1]; } else { jdx = me; il = ix - lo[0] + 1; jl = iy - lo[1]; kl = iz - lo[2]; ldpi = ldi; } idx = kl*ldpi*ldj + jl*ldpi + il; nghbrs[ncnt] = idx; procid[ncnt] = jdx; } if (ix-1 >= 0) { ncnt++; ixn[ncnt] = ix - 1; iyn[ncnt] = iy; izn[ncnt] = iz; if (ix-1 < lo[0]) { jdx = kp*pdi*pdj + jp*pdi + ip - 1; il = xld[ip-1] - 1; jl = iy - lo[1]; kl = iz - lo[2]; ldmi = xld[ip-1]; } else { jdx = me; il = ix - lo[0] - 1; jl = iy - lo[1]; kl = iz - lo[2]; ldmi = ldi; } idx = kl*ldmi*ldj + jl*ldmi + il; nghbrs[ncnt] = idx; procid[ncnt] = jdx; } if (iy+1 <= jdim-1) { ncnt++; ixn[ncnt] = ix; iyn[ncnt] = iy + 1; izn[ncnt] = iz; if (iy+1 > hi[1]) { jdx = kp*pdi*pdj + (jp+1)*pdi + ip; il = ix - lo[0]; jl = 0; kl = iz - lo[2]; ldpj = yld[jp+1]; } else { jdx = me; il = ix - lo[0]; jl = iy - lo[1] + 1; kl = iz - lo[2]; ldpj = ldj; } idx = kl*ldi*ldpj + jl*ldi + il; nghbrs[ncnt] = idx; procid[ncnt] = jdx; } if (iy-1 >= 0) { ncnt++; ixn[ncnt] = ix; iyn[ncnt] = iy - 1; izn[ncnt] = iz; if (iy-1 < lo[1]) { jdx = kp*pdi*pdj + (jp-1)*pdi + ip; il = ix - lo[0]; jl = yld[jp-1] - 1; kl = iz - lo[2]; ldmj = yld[jp-1]; } else { jdx = me; il = ix - lo[0]; jl = iy - lo[1] - 1; kl = iz - lo[2]; ldmj = ldj; } idx = kl*ldi*ldmj + jl*ldi + il; nghbrs[ncnt] = idx; procid[ncnt] = jdx; } if (iz+1 <= kdim-1) { ncnt++; ixn[ncnt] = ix; iyn[ncnt] = iy; izn[ncnt] = iz + 1; if (iz+1 > hi[2]) { jdx = (kp+1)*pdi*pdj + jp*pdi + ip; il = ix - lo[0]; jl = iy - lo[1]; kl = 0; } else { jdx = me; il = ix - lo[0]; jl = iy - lo[1]; kl = iz - lo[2] + 1; } idx = kl*ldi*ldj + jl*ldi + il; nghbrs[ncnt] = idx; procid[ncnt] = jdx; } if (iz-1 >= 0) { ncnt++; ixn[ncnt] = ix; iyn[ncnt] = iy; izn[ncnt] = iz - 1; if (iz-1 < lo[2]) { jdx = (kp-1)*pdi*pdj + jp*pdi + ip; il = ix - lo[0]; jl = iy - lo[1]; kl = zld[kp-1] - 1; } else { jdx = me; il = ix - lo[0]; jl = iy - lo[1]; kl = iz - lo[2] - 1; } idx = kl*ldi*ldj + jl*ldi + il; nghbrs[ncnt] = idx; procid[ncnt] = jdx; } /** * sort j indices. This uses a simple bubble sort, but ncnt should be small. A * more sophisticated approach could be taken if this is too time consuming. */ ncnt++; for (j=0; j nghbrs[k]) { itmp = nghbrs[j]; nghbrs[j] = nghbrs[k]; nghbrs[k] = itmp; itmp = ixn[j]; ixn[j] = ixn[k]; ixn[k] = itmp; itmp = iyn[j]; iyn[j] = iyn[k]; iyn[k] = itmp; itmp = izn[j]; izn[j] = izn[k]; izn[k] = itmp; itmp = procid[j]; procid[j] = procid[k]; procid[k] = itmp; } } } for (k=0; k= ntot) { printf("p[%d] Invalid neighbor %d\n",me,nghbrs[k]); } } /** * create array elements * for (j=0; j=idbg) { } /* TODO: Check this carefully */ jval[lvoffset[idx]+licnt[idx]] = nghbrs[j]; ivalt[idx*isize+i-imin] = ivalt[idx*isize+i-imin]+1; licnt[idx]++; } } for (i=0; i ntot-1) { bhi[1] = ntot-1; } btot = (hi[0]-lo[0]+1)*(hi[1]-lo[1]+1)*(hi[2]-lo[2]+1); for (i=0; i