/*
   Implements the various scatter operations on CUDA vectors
*/

#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX

#include <petscconf.h>
#include <petsc/private/vecimpl.h>          /*I "petscvec.h" I*/
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/cudavecimpl.h>

#include <cuda_runtime.h>

PetscErrorCode VecScatterCUDAIndicesCreate_StoS(PetscInt n,PetscInt toFirst,PetscInt fromFirst,PetscInt toStep,PetscInt fromStep,PetscInt *tslots,PetscInt *fslots,PetscCUDAIndices *ci)
{
  PetscCUDAIndices           cci;
  VecScatterCUDAIndices_StoS stos_scatter;
  cudaError_t                err;
  PetscInt                   *intVecGPU;
  int                        device;
  cudaDeviceProp             props;
  PetscErrorCode             ierr;

  PetscFunctionBegin;
  cci = new struct _p_PetscCUDAIndices;
  stos_scatter = new struct _p_VecScatterCUDAIndices_StoS;

  /* create the "from" indices */
  stos_scatter->fslots    = 0;
  stos_scatter->fromFirst = 0;
  stos_scatter->fromStep  = 0;
  if (n) {
    if (fslots) {
      /* allocate GPU memory for the from-slots */
      err = cudaMalloc((void**)&intVecGPU,n*sizeof(PetscInt));CHKERRCUDA(err);
      err = cudaMemcpy(intVecGPU,fslots,n*sizeof(PetscInt),cudaMemcpyHostToDevice);CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

      /* assign the pointer to the struct */
      stos_scatter->fslots   = intVecGPU;
      stos_scatter->fromMode = VEC_SCATTER_CUDA_GENERAL;
    } else if (fromStep) {
      stos_scatter->fromFirst = fromFirst;
      stos_scatter->fromStep  = fromStep;
      stos_scatter->fromMode  = VEC_SCATTER_CUDA_STRIDED;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must provide fslots or fromStep.");
  }

  /* create the "to" indices */
  stos_scatter->tslots  = 0;
  stos_scatter->toFirst = 0;
  stos_scatter->toStep  = 0;
  if (n) {
    if (tslots) {
      /* allocate GPU memory for the to-slots */
      err = cudaMalloc((void**)&intVecGPU,n*sizeof(PetscInt));CHKERRCUDA(err);
      err = cudaMemcpy(intVecGPU,tslots,n*sizeof(PetscInt),cudaMemcpyHostToDevice);CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

      /* assign the pointer to the struct */
      stos_scatter->tslots = intVecGPU;
      stos_scatter->toMode = VEC_SCATTER_CUDA_GENERAL;
    } else if (toStep) {
      stos_scatter->toFirst = toFirst;
      stos_scatter->toStep  = toStep;
      stos_scatter->toMode  = VEC_SCATTER_CUDA_STRIDED;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must provide tslots or toStep.");
  }

  /* The scatter uses the default stream as well.
     Note: a separate stream was used here in the past, but without proper
     synchronization with the default stream one inevitably runs into data
     races; keep that in mind when trying to reintroduce streams for scatters.
     Ultimately, it is better to have correct code/results at 90 percent of
     peak performance than incorrect code/results at 99 percent. */
  stos_scatter->stream = 0;

  /* the number of indices */
  stos_scatter->n = n;

  /* query the device limits used to bound the kernel launch configuration:
     maximum co-resident threads per multiprocessor and a cap on blocks */
  err = cudaGetDevice(&device);CHKERRCUDA(err);
  err = cudaGetDeviceProperties(&props,device);CHKERRCUDA(err);
  stos_scatter->MAX_CORESIDENT_THREADS = props.maxThreadsPerMultiProcessor;
  if (props.major >= 3) {
    stos_scatter->MAX_BLOCKS = 16*props.multiProcessorCount;
  } else {
    stos_scatter->MAX_BLOCKS = 8*props.multiProcessorCount;
  }

  /* assign the indices */
  cci->scatter     = (VecScatterCUDAIndices_StoS)stos_scatter;
  cci->scatterType = VEC_SCATTER_CUDA_STOS;
  *ci = cci;
  PetscFunctionReturn(0);
}
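
/* A minimal, illustrative creation sketch (hypothetical values): passing host
   slot arrays selects the general branch, which copies them to the GPU, while
   NULL slots with nonzero steps select the strided branch instead.

     PetscInt         fslots[3] = {0,2,4},tslots[3] = {1,3,5};
     PetscCUDAIndices ci;
     ierr = VecScatterCUDAIndicesCreate_StoS(3,0,0,0,0,tslots,fslots,&ci);CHKERRQ(ierr);
*/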

PetscErrorCode VecScatterCUDAIndicesCreate_PtoP(PetscInt ns,PetscInt *sendIndices,PetscInt nr,PetscInt *recvIndices,PetscCUDAIndices *ci)
{
  PetscCUDAIndices           cci;
  VecScatterCUDAIndices_PtoP ptop_scatter;

  PetscFunctionBegin;
  cci = new struct _p_PetscCUDAIndices;
  ptop_scatter = new struct _p_VecScatterCUDAIndices_PtoP;

  /* this calculation assumes that the input indices are sorted; guard against
     empty index arrays to avoid reading one element before the array */
  if (ns && sendIndices) {
    ptop_scatter->ns = sendIndices[ns-1]-sendIndices[0]+1;
    ptop_scatter->sendLowestIndex = sendIndices[0];
  } else {
    ptop_scatter->ns = 0;
    ptop_scatter->sendLowestIndex = 0;
  }
  if (nr && recvIndices) {
    ptop_scatter->nr = recvIndices[nr-1]-recvIndices[0]+1;
    ptop_scatter->recvLowestIndex = recvIndices[0];
  } else {
    ptop_scatter->nr = 0;
    ptop_scatter->recvLowestIndex = 0;
  }
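
  /* Worked example (illustrative): sorted sendIndices {3,5,9} with ns = 3 give a
     contiguous span of ptop_scatter->ns = 9-3+1 = 7 entries starting at lowest
     index 3, so the span may include entries that are never scattered. */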

  /* assign indices */
  cci->scatter     = (VecScatterCUDAIndices_PtoP)ptop_scatter;
  cci->scatterType = VEC_SCATTER_CUDA_PTOP;

  *ci = cci;
  PetscFunctionReturn(0);
}

PetscErrorCode VecScatterCUDAIndicesDestroy(PetscCUDAIndices *ci)
{
  PetscFunctionBegin;
  if (*ci) {
    if ((*ci)->scatterType == VEC_SCATTER_CUDA_PTOP) {
      delete (VecScatterCUDAIndices_PtoP)(*ci)->scatter;
      (*ci)->scatter = 0;
    } else {
      cudaError_t err;
      VecScatterCUDAIndices_StoS stos_scatter = (VecScatterCUDAIndices_StoS)(*ci)->scatter;
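      /* free the GPU memory for the from-slots */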
      if (stos_scatter->fslots) {
        err = cudaFree(stos_scatter->fslots);CHKERRCUDA(err);
        stos_scatter->fslots = 0;
      }

      /* free the GPU memory for the to-slots */
      if (stos_scatter->tslots) {
        err = cudaFree(stos_scatter->tslots);CHKERRCUDA(err);
        stos_scatter->tslots = 0;
      }

      /* free the stream variable */
      if (stos_scatter->stream) {
        err = cudaStreamDestroy(stos_scatter->stream);CHKERRCUDA(err);
        stos_scatter->stream = 0;
      }
      delete stos_scatter;
      (*ci)->scatter = 0;
    }
    delete *ci;
    *ci = 0;
  }
  PetscFunctionReturn(0);
}

/* Insert operator */
class Insert {
 public:
  __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
    return a;
  }
};

/* Add operator */
class Add {
 public:
  __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
    return a+b;
  }
};

/* Max operator */
class Max {
 public:
  __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
    return PetscMax(PetscRealPart(a),PetscRealPart(b));
  }
};
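
/* The operator classes above are passed by value into the kernel templates
   below, so the compiler can inline the combining operation at each store.
   Note that Max combines the real parts only; for complex scalars the
   imaginary part of the result is zero. */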

/* Sequential general to sequential general GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUDA_SGtoSG_kernel(PetscInt n,PetscInt *xind,const PetscScalar *x,PetscInt *yind,PetscScalar *y,OPERATOR OP) {
  const int tidx      = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[yind[i]] = OP(x[xind[i]],y[yind[i]]);
  }
}
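
/* All four kernels use the grid-stride loop pattern above: each thread starts
   at its global index and advances by the total number of launched threads,
   so correctness does not depend on launching one thread per entry. */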

/* Sequential general to sequential strided GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUDA_SGtoSS_kernel(PetscInt n,PetscInt *xind,const PetscScalar *x,PetscInt toFirst,PetscInt toStep,PetscScalar *y,OPERATOR OP) {
  const int tidx      = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[toFirst+i*toStep] = OP(x[xind[i]],y[toFirst+i*toStep]);
  }
}

/* Sequential strided to sequential strided GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUDA_SStoSS_kernel(PetscInt n,PetscInt fromFirst,PetscInt fromStep,const PetscScalar *x,PetscInt toFirst,PetscInt toStep,PetscScalar *y,OPERATOR OP) {
  const int tidx      = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[toFirst+i*toStep] = OP(x[fromFirst+i*fromStep],y[toFirst+i*toStep]);
  }
}

/* Sequential strided to sequential general GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUDA_SStoSG_kernel(PetscInt n,PetscInt fromFirst,PetscInt fromStep,const PetscScalar *x,PetscInt *yind,PetscScalar *y,OPERATOR OP) {
  const int tidx      = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[yind[i]] = OP(x[fromFirst+i*fromStep],y[yind[i]]);
  }
}

template<class OPERATOR>
void VecScatterCUDA_StoS_Dispatcher(const PetscScalar *xarray,PetscScalar *yarray,PetscCUDAIndices ci,ScatterMode mode,OPERATOR OP) {
  PetscInt                   nBlocks=0,nThreads=128;
  VecScatterCUDAIndices_StoS stos_scatter = (VecScatterCUDAIndices_StoS)ci->scatter;

  nBlocks = (int)ceil(((float)stos_scatter->n)/((float)nThreads)) + 1;
  if (nBlocks > stos_scatter->MAX_CORESIDENT_THREADS/nThreads) {
    nBlocks = stos_scatter->MAX_CORESIDENT_THREADS/nThreads;
  }
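
  /* Capping nBlocks below what n would require is safe because the kernels use
     grid-stride loops: remaining entries are covered by extra loop iterations. */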
  dim3 block(nThreads,1,1);
  dim3 grid(nBlocks,1,1);

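  /* SCATTER_FORWARD reads through the "from" description and writes through the
     "to" description; SCATTER_REVERSE swaps the two roles. */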
  if (mode == SCATTER_FORWARD) {
    if (stos_scatter->fromMode == VEC_SCATTER_CUDA_GENERAL && stos_scatter->toMode == VEC_SCATTER_CUDA_GENERAL) {
      VecScatterCUDA_SGtoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fslots,xarray,stos_scatter->tslots,yarray,OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUDA_GENERAL && stos_scatter->toMode == VEC_SCATTER_CUDA_STRIDED) {
      VecScatterCUDA_SGtoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fslots,xarray,stos_scatter->toFirst,stos_scatter->toStep,yarray,OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUDA_STRIDED && stos_scatter->toMode == VEC_SCATTER_CUDA_STRIDED) {
      VecScatterCUDA_SStoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fromFirst,stos_scatter->fromStep,xarray,stos_scatter->toFirst,stos_scatter->toStep,yarray,OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUDA_STRIDED && stos_scatter->toMode == VEC_SCATTER_CUDA_GENERAL) {
      VecScatterCUDA_SStoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fromFirst,stos_scatter->fromStep,xarray,stos_scatter->tslots,yarray,OP);
    }
  } else {
    if (stos_scatter->toMode == VEC_SCATTER_CUDA_GENERAL && stos_scatter->fromMode == VEC_SCATTER_CUDA_GENERAL) {
      VecScatterCUDA_SGtoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->tslots,xarray,stos_scatter->fslots,yarray,OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUDA_GENERAL && stos_scatter->fromMode == VEC_SCATTER_CUDA_STRIDED) {
      VecScatterCUDA_SGtoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->tslots,xarray,stos_scatter->fromFirst,stos_scatter->fromStep,yarray,OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUDA_STRIDED && stos_scatter->fromMode == VEC_SCATTER_CUDA_STRIDED) {
      VecScatterCUDA_SStoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->toFirst,stos_scatter->toStep,xarray,stos_scatter->fromFirst,stos_scatter->fromStep,yarray,OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUDA_STRIDED && stos_scatter->fromMode == VEC_SCATTER_CUDA_GENERAL) {
      VecScatterCUDA_SStoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->toFirst,stos_scatter->toStep,xarray,stos_scatter->fslots,yarray,OP);
    }
  }
}

PetscErrorCode VecScatterCUDA_StoS(Vec x,Vec y,PetscCUDAIndices ci,InsertMode addv,ScatterMode mode)
{
  PetscErrorCode             ierr;
  const PetscScalar          *xarray;
  PetscScalar                *yarray;
  VecScatterCUDAIndices_StoS stos_scatter = (VecScatterCUDAIndices_StoS)ci->scatter;
  cudaError_t                err;

  PetscFunctionBegin;
  ierr = VecCUDAAllocateCheck(x);CHKERRQ(ierr);
  ierr = VecCUDAAllocateCheck(y);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(x,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArray(y,&yarray);CHKERRQ(ierr);
  if (stos_scatter->n) {
    if (addv == INSERT_VALUES)    VecScatterCUDA_StoS_Dispatcher(xarray,yarray,ci,mode,Insert());
    else if (addv == ADD_VALUES)  VecScatterCUDA_StoS_Dispatcher(xarray,yarray,ci,mode,Add());
    else if (addv == MAX_VALUES)  VecScatterCUDA_StoS_Dispatcher(xarray,yarray,ci,mode,Max());
    else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Wrong insert option");
    err = cudaGetLastError();CHKERRCUDA(err);
    err = cudaStreamSynchronize(stos_scatter->stream);CHKERRCUDA(err);
  }
  ierr = VecCUDARestoreArrayRead(x,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArray(y,&yarray);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
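
/* A hedged end-to-end sketch of how these pieces fit together (illustrative
   only; x and y are assumed to be CUDA vectors of compatible size):

     PetscCUDAIndices ci;
     ierr = VecScatterCUDAIndicesCreate_StoS(n,0,0,1,1,NULL,NULL,&ci);CHKERRQ(ierr);
     ierr = VecScatterCUDA_StoS(x,y,ci,ADD_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
     ierr = VecScatterCUDAIndicesDestroy(&ci);CHKERRQ(ierr);
*/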