/*
   Implements the various scatter operations on CUDA vectors
*/

#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX

#include <petscconf.h>
#include <petsc/private/vecimpl.h>          /*I "petscvec.h" I*/
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/cudavecimpl.h>

#include <cuda_runtime.h>
PetscErrorCode VecScatterCUDAIndicesCreate_StoS(PetscInt n,PetscInt toFirst,PetscInt fromFirst,PetscInt toStep,PetscInt fromStep,PetscInt *tslots,PetscInt *fslots,PetscCUDAIndices *ci)
{
  PetscCUDAIndices           cci;
  VecScatterCUDAIndices_StoS stos_scatter;
  cudaError_t                err;
  PetscInt                   *intVecGPU;
  int                        device;
  cudaDeviceProp             props;
  PetscErrorCode             ierr;

  PetscFunctionBegin;
  cci = new struct _p_PetscCUDAIndices;
  stos_scatter = new struct _p_VecScatterCUDAIndices_StoS;

  /* create the "from" indices */
  stos_scatter->fslots = 0;
  stos_scatter->fromFirst = 0;
  stos_scatter->fromStep = 0;
  if (n) {
    if (fslots) {
      /* allocate GPU memory for the from-slots and copy the host indices over */
      err = cudaMalloc((void **)&intVecGPU,n*sizeof(PetscInt));CHKERRCUDA(err);
      err = cudaMemcpy(intVecGPU,fslots,n*sizeof(PetscInt),cudaMemcpyHostToDevice);CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

      /* assign the pointer to the struct */
      stos_scatter->fslots = intVecGPU;
      stos_scatter->fromMode = VEC_SCATTER_CUDA_GENERAL;
    } else if (fromStep) {
      stos_scatter->fromFirst = fromFirst;
      stos_scatter->fromStep = fromStep;
      stos_scatter->fromMode = VEC_SCATTER_CUDA_STRIDED;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must provide fslots or fromStep.");
  }

  /* create the "to" indices */
  stos_scatter->tslots = 0;
  stos_scatter->toFirst = 0;
  stos_scatter->toStep = 0;
  if (n) {
    if (tslots) {
      /* allocate GPU memory for the to-slots and copy the host indices over */
      err = cudaMalloc((void **)&intVecGPU,n*sizeof(PetscInt));CHKERRCUDA(err);
      err = cudaMemcpy(intVecGPU,tslots,n*sizeof(PetscInt),cudaMemcpyHostToDevice);CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

      /* assign the pointer to the struct */
      stos_scatter->tslots = intVecGPU;
      stos_scatter->toMode = VEC_SCATTER_CUDA_GENERAL;
    } else if (toStep) {
      stos_scatter->toFirst = toFirst;
      stos_scatter->toStep = toStep;
      stos_scatter->toMode = VEC_SCATTER_CUDA_STRIDED;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must provide tslots or toStep.");
  }

  /* Scatters use the default stream as well.
     Note: A separate stream was used here earlier.  However, without proper
           synchronization with the default stream one inevitably runs into data
           races, so keep that in mind when trying to reintroduce streams for
           scatters.  Ultimately, it is better to have correct code/results at 90
           percent of peak performance than incorrect code/results at 99 percent
           of peak performance. */
  stos_scatter->stream = 0;

  /* the number of indices */
  stos_scatter->n = n;

  /* query the device for the occupancy limits (co-resident threads and blocks) used by the dispatcher */
  err = cudaGetDevice(&device);CHKERRCUDA(err);
  err = cudaGetDeviceProperties(&props,device);CHKERRCUDA(err);
  stos_scatter->MAX_CORESIDENT_THREADS = props.maxThreadsPerMultiProcessor;
  if (props.major >= 3) {
    stos_scatter->MAX_BLOCKS = 16*props.multiProcessorCount;
  } else {
    stos_scatter->MAX_BLOCKS = 8*props.multiProcessorCount;
  }

  /* assign the indices */
  cci->scatter = (VecScatterCUDAIndices_StoS)stos_scatter;
  cci->scatterType = VEC_SCATTER_CUDA_STOS;
  *ci = cci;
  PetscFunctionReturn(0);
}
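
/*
   VecScatterCUDAIndicesCreate_PtoP - builds the index metadata for a
   parallel (point-to-point) scatter.  Only the contiguous range spanned by the
   sorted send/receive index lists (lowest index and length) is recorded; no
   device memory is allocated here.
*/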
PetscErrorCode VecScatterCUDAIndicesCreate_PtoP(PetscInt ns,PetscInt *sendIndices,PetscInt nr,PetscInt *recvIndices,PetscCUDAIndices *ci)
{
  PetscCUDAIndices           cci;
  PetscCUDAIndices           cci;
  VecScatterCUDAIndices_PtoP ptop_scatter;

  PetscFunctionBegin;
  cci = new struct _p_PetscCUDAIndices;
  ptop_scatter = new struct _p_VecScatterCUDAIndices_PtoP;

  /* this calculation assumes that the input indices are sorted */
  if (ns && sendIndices) {
    ptop_scatter->ns = sendIndices[ns-1]-sendIndices[0]+1;
    ptop_scatter->sendLowestIndex = sendIndices[0];
  } else {
    ptop_scatter->ns = 0;
    ptop_scatter->sendLowestIndex = 0;
  }
  if (nr && recvIndices) {
    ptop_scatter->nr = recvIndices[nr-1]-recvIndices[0]+1;
    ptop_scatter->recvLowestIndex = recvIndices[0];
  } else {
    ptop_scatter->nr = 0;
    ptop_scatter->recvLowestIndex = 0;
  }

  /* assign indices */
  cci->scatter = (VecScatterCUDAIndices_PtoP)ptop_scatter;
  cci->scatterType = VEC_SCATTER_CUDA_PTOP;

  *ci = cci;
  PetscFunctionReturn(0);
}
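
/*
   VecScatterCUDAIndicesDestroy - releases the index metadata created by the
   routines above, freeing any device-side slot arrays and any non-default
   stream of a sequential-to-sequential scatter.
*/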
PetscErrorCode VecScatterCUDAIndicesDestroy(PetscCUDAIndices *ci)
{
  PetscFunctionBegin;
  if (*ci) {
    if ((*ci)->scatterType == VEC_SCATTER_CUDA_PTOP) {
      delete (VecScatterCUDAIndices_PtoP)(*ci)->scatter;
      (*ci)->scatter = 0;
    } else {
      cudaError_t err;
      VecScatterCUDAIndices_StoS stos_scatter = (VecScatterCUDAIndices_StoS)(*ci)->scatter;

      /* free the GPU memory for the from-slots */
      if (stos_scatter->fslots) {
        err = cudaFree(stos_scatter->fslots);CHKERRCUDA(err);
        stos_scatter->fslots = 0;
      }

      /* free the GPU memory for the to-slots */
      if (stos_scatter->tslots) {
        err = cudaFree(stos_scatter->tslots);CHKERRCUDA(err);
        stos_scatter->tslots = 0;
      }

      /* free the stream variable */
      if (stos_scatter->stream) {
        err = cudaStreamDestroy(stos_scatter->stream);CHKERRCUDA(err);
        stos_scatter->stream = 0;
      }
      delete stos_scatter;
      (*ci)->scatter = 0;
    }
    delete *ci;
    *ci = 0;
  }
  PetscFunctionReturn(0);
}
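
/* Device functors used as the OPERATOR template argument of the scatter
   kernels below: each combines the scattered source value a with the current
   destination value b, mirroring INSERT_VALUES, ADD_VALUES and MAX_VALUES. */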
/* Insert operator */
class Insert {
  public:
    __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
      return a;
    }
};

/* Add operator */
class Add {
  public:
    __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
      return a+b;
    }
};

/* Max operator (compares and returns real parts) */
class Max {
  public:
    __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
      return PetscMax(PetscRealPart(a),PetscRealPart(b));
    }
};
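
/* The four kernels below cover the combinations of general (SG, explicit index
   list) and strided (SS, first/step) source and destination layouts.  Each one
   uses a grid-stride loop so that any launch configuration processes all n
   entries. */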
/* Sequential general to sequential general GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUDA_SGtoSG_kernel(PetscInt n,PetscInt *xind,const PetscScalar *x,PetscInt *yind,PetscScalar *y,OPERATOR OP) {
  const int tidx = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[yind[i]] = OP(x[xind[i]],y[yind[i]]);
  }
}

/* Sequential general to sequential strided GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUDA_SGtoSS_kernel(PetscInt n,PetscInt *xind,const PetscScalar *x,PetscInt toFirst,PetscInt toStep,PetscScalar *y,OPERATOR OP) {
  const int tidx = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[toFirst+i*toStep] = OP(x[xind[i]],y[toFirst+i*toStep]);
  }
}

/* Sequential strided to sequential strided GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUDA_SStoSS_kernel(PetscInt n,PetscInt fromFirst,PetscInt fromStep,const PetscScalar *x,PetscInt toFirst,PetscInt toStep,PetscScalar *y,OPERATOR OP) {
  const int tidx = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[toFirst+i*toStep] = OP(x[fromFirst+i*fromStep],y[toFirst+i*toStep]);
  }
}

/* Sequential strided to sequential general GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUDA_SStoSG_kernel(PetscInt n,PetscInt fromFirst,PetscInt fromStep,const PetscScalar *x,PetscInt *yind,PetscScalar *y,OPERATOR OP) {
  const int tidx = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[yind[i]] = OP(x[fromFirst+i*fromStep],y[yind[i]]);
  }
}
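
/* Dispatcher: computes a launch configuration from the number of scattered
   entries, capped by the occupancy limit queried at creation time, and selects
   the kernel matching the from/to modes.  For SCATTER_REVERSE the roles of the
   "from" and "to" descriptions are swapped. */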
template<class OPERATOR>
void VecScatterCUDA_StoS_Dispatcher(const PetscScalar *xarray,PetscScalar *yarray,PetscCUDAIndices ci,ScatterMode mode,OPERATOR OP) {
  PetscInt                   nBlocks=0,nThreads=128;
  VecScatterCUDAIndices_StoS stos_scatter = (VecScatterCUDAIndices_StoS)ci->scatter;

  nBlocks = (int)ceil(((float)stos_scatter->n)/((float)nThreads))+1;
  if (nBlocks > stos_scatter->MAX_CORESIDENT_THREADS/nThreads) {
    nBlocks = stos_scatter->MAX_CORESIDENT_THREADS/nThreads;
  }
  dim3 block(nThreads,1,1);
  dim3 grid(nBlocks,1,1);

  if (mode == SCATTER_FORWARD) {
    if (stos_scatter->fromMode == VEC_SCATTER_CUDA_GENERAL && stos_scatter->toMode == VEC_SCATTER_CUDA_GENERAL) {
      VecScatterCUDA_SGtoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fslots,xarray,stos_scatter->tslots,yarray,OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUDA_GENERAL && stos_scatter->toMode == VEC_SCATTER_CUDA_STRIDED) {
      VecScatterCUDA_SGtoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fslots,xarray,stos_scatter->toFirst,stos_scatter->toStep,yarray,OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUDA_STRIDED && stos_scatter->toMode == VEC_SCATTER_CUDA_STRIDED) {
      VecScatterCUDA_SStoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fromFirst,stos_scatter->fromStep,xarray,stos_scatter->toFirst,stos_scatter->toStep,yarray,OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUDA_STRIDED && stos_scatter->toMode == VEC_SCATTER_CUDA_GENERAL) {
      VecScatterCUDA_SStoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fromFirst,stos_scatter->fromStep,xarray,stos_scatter->tslots,yarray,OP);
    }
  } else {
    if (stos_scatter->toMode == VEC_SCATTER_CUDA_GENERAL && stos_scatter->fromMode == VEC_SCATTER_CUDA_GENERAL) {
      VecScatterCUDA_SGtoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->tslots,xarray,stos_scatter->fslots,yarray,OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUDA_GENERAL && stos_scatter->fromMode == VEC_SCATTER_CUDA_STRIDED) {
      VecScatterCUDA_SGtoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->tslots,xarray,stos_scatter->fromFirst,stos_scatter->fromStep,yarray,OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUDA_STRIDED && stos_scatter->fromMode == VEC_SCATTER_CUDA_STRIDED) {
      VecScatterCUDA_SStoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->toFirst,stos_scatter->toStep,xarray,stos_scatter->fromFirst,stos_scatter->fromStep,yarray,OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUDA_STRIDED && stos_scatter->fromMode == VEC_SCATTER_CUDA_GENERAL) {
      VecScatterCUDA_SStoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->toFirst,stos_scatter->toStep,xarray,stos_scatter->fslots,yarray,OP);
    }
  }
}
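
/*
   VecScatterCUDA_StoS - performs a sequential-to-sequential scatter entirely on
   the GPU: maps the InsertMode to one of the functors above, launches the
   appropriate kernel through the dispatcher, and synchronizes on the scatter's
   stream (the default stream) before returning.
*/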
PetscErrorCode VecScatterCUDA_StoS(Vec x,Vec y,PetscCUDAIndices ci,InsertMode addv,ScatterMode mode)
{
  PetscErrorCode             ierr;
  const PetscScalar          *xarray;
  PetscScalar                *yarray;
  VecScatterCUDAIndices_StoS stos_scatter = (VecScatterCUDAIndices_StoS)ci->scatter;
  cudaError_t                err;

  PetscFunctionBegin;
  ierr = VecCUDAAllocateCheck(x);CHKERRQ(ierr);
  ierr = VecCUDAAllocateCheck(y);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(x,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArray(y,&yarray);CHKERRQ(ierr);
  if (stos_scatter->n) {
    if (addv == INSERT_VALUES)
      VecScatterCUDA_StoS_Dispatcher(xarray,yarray,ci,mode,Insert());
    else if (addv == ADD_VALUES)
      VecScatterCUDA_StoS_Dispatcher(xarray,yarray,ci,mode,Add());
    else if (addv == MAX_VALUES)
      VecScatterCUDA_StoS_Dispatcher(xarray,yarray,ci,mode,Max());
    else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Wrong insert option");
    err = cudaGetLastError();CHKERRCUDA(err);
    err = cudaStreamSynchronize(stos_scatter->stream);CHKERRCUDA(err);
  }
  ierr = VecCUDARestoreArrayRead(x,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArray(y,&yarray);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}