#include <../src/vec/is/sf/impls/basic/sfpack.h>
#include <cuda_runtime.h>
#include <petsccublas.h> /* For CHKERRCUDA */

/* Map a thread id to an index in root/leaf space through a series of 3D subdomains. See PetscSFPackOpt. */
__device__ static inline PetscInt MapTidToIndex(const PetscInt *opt,PetscInt tid)
{
  PetscInt       i,j,k,m,n,r;
  const PetscInt *offset,*start,*dx,*dy,*X,*Y;

  n      = opt[0];
  offset = opt + 1;
  start  = opt + n + 2;
  dx     = opt + 2*n + 2;
  dy     = opt + 3*n + 2;
  X      = opt + 5*n + 2;
  Y      = opt + 6*n + 2;
  for (r=0; r<n; r++) {if (tid < offset[r+1]) break;}
  m = (tid - offset[r]);
  k = m/(dx[r]*dy[r]);
  j = (m - k*dx[r]*dy[r])/dx[r];
  i = m - k*dx[r]*dy[r] - j*dx[r];

  return (start[r] + k*X[r]*Y[r] + j*X[r] + i);
}
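/* A worked example (hypothetical opt[] contents, for illustration only): with n=1 region,
   offset={0,12}, start={7}, dx={2}, dy={3}, X={4}, Y={5}, i.e., a 2x3xdz subdomain embedded in a
   4x5xZ domain starting at linear index 7. For tid=5: r=0, m=5, k=5/(2*3)=0, j=(5-0)/2=2,
   i=5-0-2*2=1, so the mapped index is 7 + 0*4*5 + 2*4 + 1 = 16. */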

/*====================================================================================*/
/* Templated CUDA kernels for pack/unpack. The Op can be regular or atomic            */
/*====================================================================================*/

/* Suppose a user calls PetscSFReduce(sf,unit,...) and <unit> is an MPI data type made of 16 PetscReals. Then
   <Type> is PetscReal, the primitive type we operate on;
   <bs>   is 16, i.e., <unit> contains 16 primitive types;
   <BS>   is 8, the maximal SIMD width we will try to vectorize operations on <unit> with;
   <EQ>   is 0, i.e., (bs == BS ? 1 : 0).

   If instead <unit> has 8 PetscReals, then bs=8, BS=8, EQ=1, making MBS below a compile-time constant.
   For the common case in VecScatter, bs=1, BS=1, EQ=1, MBS=1, and the inner for-loops below are fully unrolled.
*/
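/* For instance (illustrative, matching the dispatch in PetscSFLinkSetUp_Cuda at the end of this file):
   a unit of 16 PetscReals selects PackInit_RealType<PetscReal,8,0>, so kernels run with bs=16, BS=8,
   EQ=0, hence M=2 and MBS=16 computed at run time; a unit of 8 PetscReals selects
   PackInit_RealType<PetscReal,8,1>, so EQ=1 and MBS=8 is const-propagated at compile time. */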
template<class Type,PetscInt BS,PetscInt EQ>
__global__ static void d_Pack(PetscInt bs,PetscInt count,PetscInt start,const PetscInt *opt,const PetscInt *idx,const Type *data,Type *buf)
{
  PetscInt       i,s,t,tid = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscInt grid_size = gridDim.x * blockDim.x;
  const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */
  const PetscInt MBS = M*BS;           /* MBS=bs. We turn MBS into a compile-time const when EQ=1. */

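  /* Grid-stride loop: the host wrappers below cap nblocks, so each thread may handle multiple entries */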
  for (; tid<count; tid += grid_size) {
    /* opt != NULL ==> idx == NULL, i.e., the indices have patterns but are not contiguous;
       opt == NULL && idx == NULL ==> the indices are contiguous;
       opt == NULL && idx != NULL ==> the indices are irregular and given explicitly by idx[].
     */
    t = (opt? MapTidToIndex(opt,tid) : (idx? idx[tid] : start+tid))*MBS;
    s = tid*MBS;
    for (i=0; i<MBS; i++) buf[s+i] = data[t+i];
  }
}

template<class Type,class Op,PetscInt BS,PetscInt EQ>
__global__ static void d_UnpackAndOp(PetscInt bs,PetscInt count,PetscInt start,const PetscInt *opt,const PetscInt *idx,Type *data,const Type *buf)
{
  PetscInt       i,s,t,tid = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscInt grid_size = gridDim.x * blockDim.x;
  const PetscInt M = (EQ) ? 1 : bs/BS, MBS = M*BS;
  Op             op;

  for (; tid<count; tid += grid_size) {
    t = (opt? MapTidToIndex(opt,tid) : (idx? idx[tid] : start+tid))*MBS;
    s = tid*MBS;
    for (i=0; i<MBS; i++) op(data[t+i],buf[s+i]);
  }
}

template<class Type,class Op,PetscInt BS,PetscInt EQ>
__global__ static void d_FetchAndOp(PetscInt bs,PetscInt count,PetscInt rootstart,const PetscInt *rootopt,const PetscInt *rootidx,Type *rootdata,Type *leafbuf)
{
  PetscInt       i,l,r,tid = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscInt grid_size = gridDim.x * blockDim.x;
  const PetscInt M = (EQ) ? 1 : bs/BS, MBS = M*BS;
  Op             op;

  for (; tid<count; tid += grid_size) {
    r = (rootopt? MapTidToIndex(rootopt,tid) : (rootidx? rootidx[tid] : rootstart+tid))*MBS;
    l = tid*MBS;
    for (i=0; i<MBS; i++) leafbuf[l+i] = op(rootdata[r+i],leafbuf[l+i]);
  }
}

template<class Type,class Op,PetscInt BS,PetscInt EQ>
__global__ static void d_ScatterAndOp(PetscInt bs,PetscInt count,PetscInt srcx,PetscInt srcy,PetscInt srcX,PetscInt srcY,PetscInt srcStart,const PetscInt *srcIdx,const Type *src,PetscInt dstx,PetscInt dsty,PetscInt dstX,PetscInt dstY,PetscInt dstStart,const PetscInt *dstIdx,Type *dst)
{
  PetscInt       i,j,k,s,t,tid = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscInt grid_size = gridDim.x * blockDim.x;
  const PetscInt M = (EQ) ? 1 : bs/BS, MBS = M*BS;
  Op             op;

  for (; tid<count; tid += grid_size) {
    if (!srcIdx) { /* src is either contiguous or 3D */
      k = tid/(srcx*srcy);
      j = (tid - k*srcx*srcy)/srcx;
      i = tid - k*srcx*srcy - j*srcx;
      s = srcStart + k*srcX*srcY + j*srcX + i;
    } else {
      s = srcIdx[tid];
    }

    if (!dstIdx) { /* dst is either contiguous or 3D */
      k = tid/(dstx*dsty);
      j = (tid - k*dstx*dsty)/dstx;
      i = tid - k*dstx*dsty - j*dstx;
      t = dstStart + k*dstX*dstY + j*dstX + i;
    } else {
      t = dstIdx[tid];
    }

    s *= MBS;
    t *= MBS;
    for (i=0; i<MBS; i++) op(dst[t+i],src[s+i]);
  }
}

template<class Type,class Op,PetscInt BS,PetscInt EQ>
__global__ static void d_FetchAndOpLocal(PetscInt bs,PetscInt count,PetscInt rootstart,const PetscInt *rootopt,const PetscInt *rootidx,Type *rootdata,PetscInt leafstart,const PetscInt *leafopt,const PetscInt *leafidx,const Type *leafdata,Type *leafupdate)
{
  PetscInt       i,l,r,tid = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscInt grid_size = gridDim.x * blockDim.x;
  const PetscInt M = (EQ) ? 1 : bs/BS, MBS = M*BS;
  Op             op;

  for (; tid<count; tid += grid_size) {
    r = (rootopt? MapTidToIndex(rootopt,tid) : (rootidx? rootidx[tid] : rootstart+tid))*MBS;
    l = (leafopt? MapTidToIndex(leafopt,tid) : (leafidx? leafidx[tid] : leafstart+tid))*MBS;
    for (i=0; i<MBS; i++) leafupdate[l+i] = op(rootdata[r+i],leafdata[l+i]);
  }
}

/*====================================================================================*/
/* Regular operations on device                                                       */
/*====================================================================================*/
template<typename Type> struct Insert {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = y;             return old;}};
template<typename Type> struct Add    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x += y;             return old;}};
template<typename Type> struct Mult   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x *= y;             return old;}};
template<typename Type> struct Min    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = PetscMin(x,y); return old;}};
template<typename Type> struct Max    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = PetscMax(x,y); return old;}};
template<typename Type> struct LAND   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x && y;        return old;}};
template<typename Type> struct LOR    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x || y;        return old;}};
template<typename Type> struct LXOR   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = !x != !y;      return old;}};
template<typename Type> struct BAND   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x & y;         return old;}};
template<typename Type> struct BOR    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x | y;         return old;}};
template<typename Type> struct BXOR   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x ^ y;         return old;}};
template<typename Type> struct Minloc {
  __device__ Type operator() (Type& x,Type y) const {
    Type old = x;
    if (y.a < x.a) x = y;
    else if (y.a == x.a) x.b = min(x.b,y.b);
    return old;
  }
};
template<typename Type> struct Maxloc {
  __device__ Type operator() (Type& x,Type y) const {
    Type old = x;
    if (y.a > x.a) x = y;
    else if (y.a == x.a) x.b = min(x.b,y.b); /* See MPI MAXLOC: on a tie in value, take the smaller index */
    return old;
  }
};
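/* Example (illustrative): for pairs (a=value,b=rank), Maxloc applied to x=(5,3) and y=(5,1)
   keeps a=5 and sets b=min(3,1)=1, matching MPI_MAXLOC's rule of returning the lowest index on ties. */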

/*====================================================================================*/
/* Atomic operations on device                                                        */
/*====================================================================================*/

/*
  Atomic Insert (exchange) operations

  CUDA C Programming Guide V10.1 Chapter B.12.1.3:

  int atomicExch(int* address, int val);
  unsigned int atomicExch(unsigned int* address, unsigned int val);
  unsigned long long int atomicExch(unsigned long long int* address, unsigned long long int val);
  float atomicExch(float* address, float val);

  reads the 32-bit or 64-bit word old located at the address address in global or shared
  memory and stores val back to memory at the same address. These two operations are
  performed in one atomic transaction. The function returns old.

  PETSc notes:

  It may be useful in PetscSFFetchAndOp with op = MPIU_REPLACE.

  VecScatter with multiple entries scattered to the same location using INSERT_VALUES does not need
  atomic insertion, since it does not need the old value. A 32-bit or 64-bit store instruction should
  be atomic by itself.

  With bs>1 and a unit larger than 64 bits, the current element-wise atomic approach cannot guarantee
  the insertion of the whole unit is atomic. Hopefully no user code relies on that.
*/
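/* For example (illustrative): with a unit of two doubles (bs=2), two threads concurrently inserting
   (a0,a1) and (b0,b1) into the same unit may leave (a0,b1); each 64-bit exchange is atomic, but the
   pair of exchanges is not. */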
__device__ static double atomicExch(double* address,double val) {return __longlong_as_double(atomicExch((unsigned long long int*)address,__double_as_longlong(val)));}

#if defined(PETSC_USE_64BIT_INDICES)
__device__ static PetscInt atomicExch(PetscInt* address,PetscInt val) {return (PetscInt)(atomicExch((unsigned long long int*)address,(unsigned long long int)val));}
#endif

template<typename Type> struct AtomicInsert {__device__ Type operator() (Type& x,Type y) const {return atomicExch(&x,y);}};

#if defined(PETSC_HAVE_COMPLEX)
#if defined(PETSC_USE_REAL_DOUBLE)
/* CUDA does not support 128-bit atomics. Users should not insert different 128-bit PetscComplex values to the same location */
template<> struct AtomicInsert<PetscComplex> {
  __device__ PetscComplex operator() (PetscComplex& x,PetscComplex y) const {
    PetscComplex         old, *z = &old;
    double               *xp = (double*)&x,*yp = (double*)&y;
    AtomicInsert<double> op;
    z[0] = op(xp[0],yp[0]);
    z[1] = op(xp[1],yp[1]);
    return old; /* The returned value may not be atomic. It may be a mix of two exchanges. The caller should discard it. */
  }
};
#elif defined(PETSC_USE_REAL_SINGLE)
template<> struct AtomicInsert<PetscComplex> {
  __device__ PetscComplex operator() (PetscComplex& x,PetscComplex y) const {
    double               *xp = (double*)&x,*yp = (double*)&y;
    AtomicInsert<double> op;
    return op(xp[0],yp[0]);
  }
};
#endif
#endif

/*
  Atomic add operations

  CUDA C Programming Guide V10.1 Chapter B.12.1.1:

  int atomicAdd(int* address, int val);
  unsigned int atomicAdd(unsigned int* address,unsigned int val);
  unsigned long long int atomicAdd(unsigned long long int* address,unsigned long long int val);
  float atomicAdd(float* address, float val);
  double atomicAdd(double* address, double val);
  __half2 atomicAdd(__half2 *address, __half2 val);
  __half atomicAdd(__half *address, __half val);

  reads the 16-bit, 32-bit or 64-bit word old located at the address address in global or shared memory, computes (old + val),
  and stores the result back to memory at the same address. These three operations are performed in one atomic transaction. The
  function returns old.

  The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher.
  The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher.
  The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and
  higher. The atomicity of the __half2 add operation is guaranteed separately for each of the two __half elements;
  the entire __half2 is not guaranteed to be atomic as a single 32-bit access.
  The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher.
*/

#if defined(PETSC_USE_64BIT_INDICES)
__device__ static PetscInt atomicAdd(PetscInt* address,PetscInt val) {return (PetscInt)atomicAdd((unsigned long long int*)address,(unsigned long long int)val);}
#endif

template<typename Type> struct AtomicAdd {__device__ Type operator() (Type& x,Type y) const {return atomicAdd(&x,y);}};

template<> struct AtomicAdd<double> {
  __device__ double operator() (double& x,double y) const {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
    return atomicAdd(&x,y);
#else
    double *address = &x, val = y;
    unsigned long long int *address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
      assumed = old;
      old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
      /* Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) */
    } while (assumed != old);
    return __longlong_as_double(old);
#endif
  }
};

template<> struct AtomicAdd<float> {
  __device__ float operator() (float& x,float y) const {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
    return atomicAdd(&x,y);
#else
    float *address = &x, val = y;
    int   *address_as_int = (int*)address;
    int   old = *address_as_int, assumed;
    do {
      assumed = old;
      old     = atomicCAS(address_as_int, assumed, __float_as_int(val + __int_as_float(assumed)));
      /* Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) */
    } while (assumed != old);
    return __int_as_float(old);
#endif
  }
};

#if defined(PETSC_HAVE_COMPLEX)
template<> struct AtomicAdd<PetscComplex> {
  __device__ PetscComplex operator() (PetscComplex& x,PetscComplex y) const {
    PetscComplex         old, *z = &old;
    PetscReal            *xp = (PetscReal*)&x,*yp = (PetscReal*)&y;
    AtomicAdd<PetscReal> op;
    z[0] = op(xp[0],yp[0]);
    z[1] = op(xp[1],yp[1]);
    return old; /* The returned value may not be atomic. It may be a mix of two adds. The caller should discard it. */
  }
};
#endif

/*
  Atomic Mult operations:

  CUDA has no atomicMult at all, so we build our own with atomicCAS
*/
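/* The CAS loop pattern used below: read the current value, compute the new value from it, then
   atomically swap it in only if the location still holds the value we read; otherwise retry. */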
#if defined(PETSC_USE_REAL_DOUBLE)
__device__ static double atomicMult(double* address, double val)
{
  unsigned long long int *address_as_ull = (unsigned long long int*)(address);
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    /* Other threads can access and modify the value of *address_as_ull after the read above and before the write below */
    old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(val*__longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}
#elif defined(PETSC_USE_REAL_SINGLE)
__device__ static float atomicMult(float* address,float val)
{
  int *address_as_int = (int*)(address);
  int old = *address_as_int, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_int, assumed, __float_as_int(val*__int_as_float(assumed)));
  } while (assumed != old);
  return __int_as_float(old);
}
#endif

__device__ static int atomicMult(int* address,int val)
{
  int *address_as_int = (int*)(address);
  int old = *address_as_int, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_int, assumed, val*assumed);
  } while (assumed != old);
  return (int)old;
}

#if defined(PETSC_USE_64BIT_INDICES)
__device__ static PetscInt atomicMult(PetscInt* address,PetscInt val)
{
  unsigned long long int *address_as_ull = (unsigned long long int*)(address);
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (unsigned long long int)(val*(PetscInt)assumed));
  } while (assumed != old);
  return (PetscInt)old;
}
#endif

template<typename Type> struct AtomicMult {__device__ Type operator() (Type& x,Type y) const {return atomicMult(&x,y);}};

/*
  Atomic Min/Max operations

  CUDA C Programming Guide V10.1 Chapter B.12.1.4~5:

  int atomicMin(int* address, int val);
  unsigned int atomicMin(unsigned int* address,unsigned int val);
  unsigned long long int atomicMin(unsigned long long int* address,unsigned long long int val);

  reads the 32-bit or 64-bit word old located at the address address in global or shared
  memory, computes the minimum of old and val, and stores the result back to memory
  at the same address. These three operations are performed in one atomic transaction.
  The function returns old.
  The 64-bit version of atomicMin() is only supported by devices of compute capability 3.5 and higher.

  atomicMax() is similar.
*/

#if defined(PETSC_USE_REAL_DOUBLE)
__device__ static double atomicMin(double* address, double val)
{
  unsigned long long int *address_as_ull = (unsigned long long int*)(address);
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMin(val,__longlong_as_double(assumed))));
  } while (assumed != old);
  return __longlong_as_double(old);
}

__device__ static double atomicMax(double* address, double val)
{
  unsigned long long int *address_as_ull = (unsigned long long int*)(address);
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMax(val,__longlong_as_double(assumed))));
  } while (assumed != old);
  return __longlong_as_double(old);
}
#elif defined(PETSC_USE_REAL_SINGLE)
__device__ static float atomicMin(float* address,float val)
{
  int *address_as_int = (int*)(address);
  int old = *address_as_int, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_int, assumed, __float_as_int(PetscMin(val,__int_as_float(assumed))));
  } while (assumed != old);
  return __int_as_float(old);
}

__device__ static float atomicMax(float* address,float val)
{
  int *address_as_int = (int*)(address);
  int old = *address_as_int, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_int, assumed, __float_as_int(PetscMax(val,__int_as_float(assumed))));
  } while (assumed != old);
  return __int_as_float(old);
}
#endif

/*
  atomicMin/Max(long long *, long long) are not in Nvidia's documentation. But on OLCF Summit we found
  atomicMin/Max/And/Or/Xor(long long *, long long) in /sw/summit/cuda/10.1.243/include/sm_32_atomic_functions.h.
  This causes compilation errors with pgi compilers and 64-bit indices:
      error: function "atomicMin(long long *, long long)" has already been defined

  So we add the extra conditions defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320)
*/
#if defined(PETSC_USE_64BIT_INDICES) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320)
__device__ static PetscInt atomicMin(PetscInt* address,PetscInt val)
{
  unsigned long long int *address_as_ull = (unsigned long long int*)(address);
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (unsigned long long int)(PetscMin(val,(PetscInt)assumed)));
  } while (assumed != old);
  return (PetscInt)old;
}

__device__ static PetscInt atomicMax(PetscInt* address,PetscInt val)
{
  unsigned long long int *address_as_ull = (unsigned long long int*)(address);
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (unsigned long long int)(PetscMax(val,(PetscInt)assumed)));
  } while (assumed != old);
  return (PetscInt)old;
}
#endif

template<typename Type> struct AtomicMin {__device__ Type operator() (Type& x,Type y) const {return atomicMin(&x,y);}};
template<typename Type> struct AtomicMax {__device__ Type operator() (Type& x,Type y) const {return atomicMax(&x,y);}};

/*
  Atomic bitwise operations

  CUDA C Programming Guide V10.1 Chapter B.12.2.1 ~ B.12.2.3:

  int atomicAnd(int* address, int val);
  unsigned int atomicAnd(unsigned int* address,unsigned int val);
  unsigned long long int atomicAnd(unsigned long long int* address,unsigned long long int val);

  reads the 32-bit or 64-bit word old located at the address address in global or shared
  memory, computes (old & val), and stores the result back to memory at the same
  address. These three operations are performed in one atomic transaction.
  The function returns old.

  The 64-bit version of atomicAnd() is only supported by devices of compute capability 3.5 and higher.

  atomicOr() and atomicXor() are similar.
*/

#if defined(PETSC_USE_64BIT_INDICES)
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320) /* Why 320? See comments at atomicMin(PetscInt* address,PetscInt val) */
__device__ static PetscInt atomicAnd(PetscInt* address,PetscInt val)
{
  unsigned long long int *address_as_ull = (unsigned long long int*)(address);
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (unsigned long long int)(val & (PetscInt)assumed));
  } while (assumed != old);
  return (PetscInt)old;
}
__device__ static PetscInt atomicOr(PetscInt* address,PetscInt val)
{
  unsigned long long int *address_as_ull = (unsigned long long int*)(address);
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (unsigned long long int)(val | (PetscInt)assumed));
  } while (assumed != old);
  return (PetscInt)old;
}

__device__ static PetscInt atomicXor(PetscInt* address,PetscInt val)
{
  unsigned long long int *address_as_ull = (unsigned long long int*)(address);
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (unsigned long long int)(val ^ (PetscInt)assumed));
  } while (assumed != old);
  return (PetscInt)old;
}
#else
/*
  See also comments at atomicMin(PetscInt* address,PetscInt val)
  __device__ static PetscInt atomicAnd(PetscInt* address,PetscInt val) {return (PetscInt)atomicAnd((unsigned long long int*)address,(unsigned long long int)val);}
  __device__ static PetscInt atomicOr (PetscInt* address,PetscInt val) {return (PetscInt)atomicOr ((unsigned long long int*)address,(unsigned long long int)val);}
  __device__ static PetscInt atomicXor(PetscInt* address,PetscInt val) {return (PetscInt)atomicXor((unsigned long long int*)address,(unsigned long long int)val);}
*/
#endif
#endif

template<typename Type> struct AtomicBAND {__device__ Type operator() (Type& x,Type y) const {return atomicAnd(&x,y);}};
template<typename Type> struct AtomicBOR  {__device__ Type operator() (Type& x,Type y) const {return atomicOr (&x,y);}};
template<typename Type> struct AtomicBXOR {__device__ Type operator() (Type& x,Type y) const {return atomicXor(&x,y);}};

/*
  Atomic logical operations:

  CUDA has no atomic logical operations at all. We support them on integer types.
*/

/* A template without a definition makes any instantiation that does not match one of the given
   specializations a compile-time error, which is what we want since we only support 32-bit and
   64-bit integers.
 */
template<typename Type,class Op,int size/* sizeof(Type) */> struct AtomicLogical;
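/* Illustrative (hypothetical type): AtomicLogical<short,land<short>,2> matches no specialization below,
   so any code that instantiates it is rejected at compile time. */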

template<typename Type,class Op>
struct AtomicLogical<Type,Op,4> {
  __device__ Type operator()(Type& x,Type y) const {
    int *address_as_int = (int*)(&x);
    int old = *address_as_int, assumed;
    Op  op;
    do {
      assumed = old;
      old     = atomicCAS(address_as_int, assumed, (int)(op((Type)assumed,y)));
    } while (assumed != old);
    return (Type)old;
  }
};

template<typename Type,class Op>
struct AtomicLogical<Type,Op,8> {
  __device__ Type operator()(Type& x,Type y) const {
    unsigned long long int *address_as_ull = (unsigned long long int*)(&x);
    unsigned long long int old = *address_as_ull, assumed;
    Op op;
    do {
      assumed = old;
      old     = atomicCAS(address_as_ull, assumed, (unsigned long long int)(op((Type)assumed,y)));
    } while (assumed != old);
    return (Type)old;
  }
};

/* Note that land/lor/lxor below differ from LAND etc. above: here we pass arguments by value and return the result of the op (not the old value) */
template<typename Type> struct land {__device__ Type operator()(Type x, Type y) {return x && y;}};
template<typename Type> struct lor  {__device__ Type operator()(Type x, Type y) {return x || y;}};
template<typename Type> struct lxor {__device__ Type operator()(Type x, Type y) {return (!x != !y);}};

template<typename Type> struct AtomicLAND {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,land<Type>,sizeof(Type)> op; return op(x,y);}};
template<typename Type> struct AtomicLOR  {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,lor<Type> ,sizeof(Type)> op; return op(x,y);}};
template<typename Type> struct AtomicLXOR {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,lxor<Type>,sizeof(Type)> op; return op(x,y);}};

/*====================================================================================*/
/* Wrapper functions of cuda kernels. Function pointers are stored in 'link'          */
/*====================================================================================*/
template<typename Type,PetscInt BS,PetscInt EQ>
static PetscErrorCode Pack(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *data,void *buf)
{
  cudaError_t    cerr;
  PetscInt       nthreads=256;
  PetscInt       nblocks=(count+nthreads-1)/nthreads;
  const PetscInt *iarray=opt ? opt->array : NULL;

  PetscFunctionBegin;
  if (!count) PetscFunctionReturn(0);
  nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
  d_Pack<Type,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,start,iarray,idx,(const Type*)data,(Type*)buf);
  cerr = cudaGetLastError();CHKERRCUDA(cerr);
  PetscFunctionReturn(0);
}
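/* A minimal usage sketch (hypothetical; the actual call sites are PetscSF's pack/unpack drivers):
   after PetscSFLinkSetUp_Cuda() below has set, say, link->d_Pack = Pack<PetscReal,8,1>, a driver would do
       ierr = (*link->d_Pack)(link,count,rootstart,rootopt,rootidx,rootdata,rootbuf);CHKERRQ(ierr);
   to pack root data into a contiguous send buffer on link->stream. */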

template<typename Type,class Op,PetscInt BS,PetscInt EQ>
static PetscErrorCode UnpackAndOp(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *data,const void *buf)
{
  cudaError_t    cerr;
  PetscInt       nthreads=256;
  PetscInt       nblocks=(count+nthreads-1)/nthreads;
  const PetscInt *iarray=opt ? opt->array : NULL;

  PetscFunctionBegin;
  if (!count) PetscFunctionReturn(0);
  nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
  d_UnpackAndOp<Type,Op,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,start,iarray,idx,(Type*)data,(const Type*)buf);
  cerr = cudaGetLastError();CHKERRCUDA(cerr);
  PetscFunctionReturn(0);
}

template<typename Type,class Op,PetscInt BS,PetscInt EQ>
static PetscErrorCode FetchAndOp(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *data,void *buf)
{
  cudaError_t    cerr;
  PetscInt       nthreads=256;
  PetscInt       nblocks=(count+nthreads-1)/nthreads;
  const PetscInt *iarray=opt ? opt->array : NULL;

  PetscFunctionBegin;
  if (!count) PetscFunctionReturn(0);
  nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
  d_FetchAndOp<Type,Op,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,start,iarray,idx,(Type*)data,(Type*)buf);
  cerr = cudaGetLastError();CHKERRCUDA(cerr);
  PetscFunctionReturn(0);
}

template<typename Type,class Op,PetscInt BS,PetscInt EQ>
static PetscErrorCode ScatterAndOp(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst)
{
  cudaError_t cerr;
  PetscInt    nthreads=256;
  PetscInt    nblocks=(count+nthreads-1)/nthreads;
  PetscInt    srcx=0,srcy=0,srcX=0,srcY=0,dstx=0,dsty=0,dstX=0,dstY=0;

  PetscFunctionBegin;
  if (!count) PetscFunctionReturn(0);
  nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);

  /* The 3D shape of the source subdomain may differ from that of the destination, which makes it hard to use a single CUDA 3D grid and block */
  if (srcOpt)       {srcx = srcOpt->dx[0]; srcy = srcOpt->dy[0]; srcX = srcOpt->X[0]; srcY = srcOpt->Y[0]; srcStart = srcOpt->start[0]; srcIdx = NULL;}
  else if (!srcIdx) {srcx = srcX = count; srcy = srcY = 1;}

  if (dstOpt)       {dstx = dstOpt->dx[0]; dsty = dstOpt->dy[0]; dstX = dstOpt->X[0]; dstY = dstOpt->Y[0]; dstStart = dstOpt->start[0]; dstIdx = NULL;}
  else if (!dstIdx) {dstx = dstX = count; dsty = dstY = 1;}

  d_ScatterAndOp<Type,Op,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,srcx,srcy,srcX,srcY,srcStart,srcIdx,(const Type*)src,dstx,dsty,dstX,dstY,dstStart,dstIdx,(Type*)dst);
  cerr = cudaGetLastError();CHKERRCUDA(cerr);
  PetscFunctionReturn(0);
}

/* Specialization for Insert since we may use cudaMemcpyAsync */
template<typename Type,PetscInt BS,PetscInt EQ>
static PetscErrorCode ScatterAndInsert(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst)
{
  PetscErrorCode ierr;
  cudaError_t    cerr;

  PetscFunctionBegin;
  if (!count) PetscFunctionReturn(0);
  /* src and dst are contiguous */
  if ((!srcOpt && !srcIdx) && (!dstOpt && !dstIdx) && src != dst) {
    cerr = cudaMemcpyAsync((Type*)dst+dstStart*link->bs,(const Type*)src+srcStart*link->bs,count*link->unitbytes,cudaMemcpyDeviceToDevice,link->stream);CHKERRCUDA(cerr);
  } else {
    ierr = ScatterAndOp<Type,Insert<Type>,BS,EQ>(link,count,srcStart,srcOpt,srcIdx,src,dstStart,dstOpt,dstIdx,dst);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

template<typename Type,class Op,PetscInt BS,PetscInt EQ>
static PetscErrorCode FetchAndOpLocal(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate)
{
  cudaError_t    cerr;
  PetscInt       nthreads=256;
  PetscInt       nblocks=(count+nthreads-1)/nthreads;
  const PetscInt *rarray = rootopt ? rootopt->array : NULL;
  const PetscInt *larray = leafopt ? leafopt->array : NULL;

  PetscFunctionBegin;
  if (!count) PetscFunctionReturn(0);
  nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
  d_FetchAndOpLocal<Type,Op,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,rootstart,rarray,rootidx,(Type*)rootdata,leafstart,larray,leafidx,(const Type*)leafdata,(Type*)leafupdate);
  cerr = cudaGetLastError();CHKERRCUDA(cerr);
  PetscFunctionReturn(0);
}

/*====================================================================================*/
/* Init various types and instantiate pack/unpack function pointers                   */
/*====================================================================================*/
template<typename Type,PetscInt BS,PetscInt EQ>
static void PackInit_RealType(PetscSFLink link)
{
  /* Pack/unpack for remote communication */
  link->d_Pack             = Pack<Type,BS,EQ>;
  link->d_UnpackAndInsert  = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
  link->d_UnpackAndAdd     = UnpackAndOp<Type,Add<Type>   ,BS,EQ>;
  link->d_UnpackAndMult    = UnpackAndOp<Type,Mult<Type>  ,BS,EQ>;
  link->d_UnpackAndMin     = UnpackAndOp<Type,Min<Type>   ,BS,EQ>;
  link->d_UnpackAndMax     = UnpackAndOp<Type,Max<Type>   ,BS,EQ>;
  link->d_FetchAndAdd      = FetchAndOp <Type,Add<Type>   ,BS,EQ>;

  /* Scatter for local communication */
  link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>; /* Has special optimizations */
  link->d_ScatterAndAdd    = ScatterAndOp<Type,Add<Type>  ,BS,EQ>;
  link->d_ScatterAndMult   = ScatterAndOp<Type,Mult<Type> ,BS,EQ>;
  link->d_ScatterAndMin    = ScatterAndOp<Type,Min<Type>  ,BS,EQ>;
  link->d_ScatterAndMax    = ScatterAndOp<Type,Max<Type>  ,BS,EQ>;
  link->d_FetchAndAddLocal = FetchAndOpLocal<Type,Add<Type>,BS,EQ>;

  /* Atomic versions when there are data-race possibilities */
  link->da_UnpackAndInsert = UnpackAndOp<Type,AtomicInsert<Type>,BS,EQ>;
  link->da_UnpackAndAdd    = UnpackAndOp<Type,AtomicAdd<Type>   ,BS,EQ>;
  link->da_UnpackAndMult   = UnpackAndOp<Type,AtomicMult<Type>  ,BS,EQ>;
  link->da_UnpackAndMin    = UnpackAndOp<Type,AtomicMin<Type>   ,BS,EQ>;
  link->da_UnpackAndMax    = UnpackAndOp<Type,AtomicMax<Type>   ,BS,EQ>;
  link->da_FetchAndAdd     = FetchAndOp <Type,AtomicAdd<Type>   ,BS,EQ>;

  link->da_ScatterAndInsert = ScatterAndOp<Type,AtomicInsert<Type>,BS,EQ>;
  link->da_ScatterAndAdd    = ScatterAndOp<Type,AtomicAdd<Type>   ,BS,EQ>;
  link->da_ScatterAndMult   = ScatterAndOp<Type,AtomicMult<Type>  ,BS,EQ>;
  link->da_ScatterAndMin    = ScatterAndOp<Type,AtomicMin<Type>   ,BS,EQ>;
  link->da_ScatterAndMax    = ScatterAndOp<Type,AtomicMax<Type>   ,BS,EQ>;
  link->da_FetchAndAddLocal = FetchAndOpLocal<Type,AtomicAdd<Type>,BS,EQ>;
}

/* This templated class lets us specialize the atomic instantiations for char-sized integers (see below) */
template<typename Type,PetscInt BS,PetscInt EQ,PetscInt size/*sizeof(Type)*/>
struct PackInit_IntegerType_Atomic {
  static void Init(PetscSFLink link) {
    link->da_UnpackAndInsert = UnpackAndOp<Type,AtomicInsert<Type>,BS,EQ>;
    link->da_UnpackAndAdd    = UnpackAndOp<Type,AtomicAdd<Type>   ,BS,EQ>;
    link->da_UnpackAndMult   = UnpackAndOp<Type,AtomicMult<Type>  ,BS,EQ>;
    link->da_UnpackAndMin    = UnpackAndOp<Type,AtomicMin<Type>   ,BS,EQ>;
    link->da_UnpackAndMax    = UnpackAndOp<Type,AtomicMax<Type>   ,BS,EQ>;
    link->da_UnpackAndLAND   = UnpackAndOp<Type,AtomicLAND<Type>  ,BS,EQ>;
    link->da_UnpackAndLOR    = UnpackAndOp<Type,AtomicLOR<Type>   ,BS,EQ>;
    link->da_UnpackAndLXOR   = UnpackAndOp<Type,AtomicLXOR<Type>  ,BS,EQ>;
    link->da_UnpackAndBAND   = UnpackAndOp<Type,AtomicBAND<Type>  ,BS,EQ>;
    link->da_UnpackAndBOR    = UnpackAndOp<Type,AtomicBOR<Type>   ,BS,EQ>;
    link->da_UnpackAndBXOR   = UnpackAndOp<Type,AtomicBXOR<Type>  ,BS,EQ>;
    link->da_FetchAndAdd     = FetchAndOp <Type,AtomicAdd<Type>   ,BS,EQ>;

    link->da_ScatterAndInsert = ScatterAndOp<Type,AtomicInsert<Type>,BS,EQ>;
    link->da_ScatterAndAdd    = ScatterAndOp<Type,AtomicAdd<Type>   ,BS,EQ>;
    link->da_ScatterAndMult   = ScatterAndOp<Type,AtomicMult<Type>  ,BS,EQ>;
    link->da_ScatterAndMin    = ScatterAndOp<Type,AtomicMin<Type>   ,BS,EQ>;
    link->da_ScatterAndMax    = ScatterAndOp<Type,AtomicMax<Type>   ,BS,EQ>;
    link->da_ScatterAndLAND   = ScatterAndOp<Type,AtomicLAND<Type>  ,BS,EQ>;
    link->da_ScatterAndLOR    = ScatterAndOp<Type,AtomicLOR<Type>   ,BS,EQ>;
    link->da_ScatterAndLXOR   = ScatterAndOp<Type,AtomicLXOR<Type>  ,BS,EQ>;
    link->da_ScatterAndBAND   = ScatterAndOp<Type,AtomicBAND<Type>  ,BS,EQ>;
    link->da_ScatterAndBOR    = ScatterAndOp<Type,AtomicBOR<Type>   ,BS,EQ>;
    link->da_ScatterAndBXOR   = ScatterAndOp<Type,AtomicBXOR<Type>  ,BS,EQ>;
    link->da_FetchAndAddLocal = FetchAndOpLocal<Type,AtomicAdd<Type>,BS,EQ>;
  }
};

/* CUDA does not support atomics on chars; supporting them in PETSc is TBD */
template<typename Type,PetscInt BS,PetscInt EQ>
struct PackInit_IntegerType_Atomic<Type,BS,EQ,1> {
  static void Init(PetscSFLink link) {/* Nothing to do; leave the function pointers NULL */}
};

template<typename Type,PetscInt BS,PetscInt EQ>
static void PackInit_IntegerType(PetscSFLink link)
{
  link->d_Pack            = Pack<Type,BS,EQ>;
  link->d_UnpackAndInsert = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
  link->d_UnpackAndAdd    = UnpackAndOp<Type,Add<Type>   ,BS,EQ>;
  link->d_UnpackAndMult   = UnpackAndOp<Type,Mult<Type>  ,BS,EQ>;
  link->d_UnpackAndMin    = UnpackAndOp<Type,Min<Type>   ,BS,EQ>;
  link->d_UnpackAndMax    = UnpackAndOp<Type,Max<Type>   ,BS,EQ>;
  link->d_UnpackAndLAND   = UnpackAndOp<Type,LAND<Type>  ,BS,EQ>;
  link->d_UnpackAndLOR    = UnpackAndOp<Type,LOR<Type>   ,BS,EQ>;
  link->d_UnpackAndLXOR   = UnpackAndOp<Type,LXOR<Type>  ,BS,EQ>;
  link->d_UnpackAndBAND   = UnpackAndOp<Type,BAND<Type>  ,BS,EQ>;
  link->d_UnpackAndBOR    = UnpackAndOp<Type,BOR<Type>   ,BS,EQ>;
  link->d_UnpackAndBXOR   = UnpackAndOp<Type,BXOR<Type>  ,BS,EQ>;
  link->d_FetchAndAdd     = FetchAndOp <Type,Add<Type>   ,BS,EQ>;

  link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>;
  link->d_ScatterAndAdd    = ScatterAndOp<Type,Add<Type> ,BS,EQ>;
  link->d_ScatterAndMult   = ScatterAndOp<Type,Mult<Type>,BS,EQ>;
  link->d_ScatterAndMin    = ScatterAndOp<Type,Min<Type> ,BS,EQ>;
  link->d_ScatterAndMax    = ScatterAndOp<Type,Max<Type> ,BS,EQ>;
  link->d_ScatterAndLAND   = ScatterAndOp<Type,LAND<Type>,BS,EQ>;
  link->d_ScatterAndLOR    = ScatterAndOp<Type,LOR<Type> ,BS,EQ>;
  link->d_ScatterAndLXOR   = ScatterAndOp<Type,LXOR<Type>,BS,EQ>;
  link->d_ScatterAndBAND   = ScatterAndOp<Type,BAND<Type>,BS,EQ>;
  link->d_ScatterAndBOR    = ScatterAndOp<Type,BOR<Type> ,BS,EQ>;
  link->d_ScatterAndBXOR   = ScatterAndOp<Type,BXOR<Type>,BS,EQ>;
  link->d_FetchAndAddLocal = FetchAndOpLocal<Type,Add<Type>,BS,EQ>;
  PackInit_IntegerType_Atomic<Type,BS,EQ,sizeof(Type)>::Init(link);
}

#if defined(PETSC_HAVE_COMPLEX)
template<typename Type,PetscInt BS,PetscInt EQ>
static void PackInit_ComplexType(PetscSFLink link)
{
  link->d_Pack            = Pack<Type,BS,EQ>;
  link->d_UnpackAndInsert = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
  link->d_UnpackAndAdd    = UnpackAndOp<Type,Add<Type>   ,BS,EQ>;
  link->d_UnpackAndMult   = UnpackAndOp<Type,Mult<Type>  ,BS,EQ>;
  link->d_FetchAndAdd     = FetchAndOp <Type,Add<Type>   ,BS,EQ>;

  link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>;
  link->d_ScatterAndAdd    = ScatterAndOp<Type,Add<Type> ,BS,EQ>;
  link->d_ScatterAndMult   = ScatterAndOp<Type,Mult<Type>,BS,EQ>;
  link->d_FetchAndAddLocal = FetchAndOpLocal<Type,Add<Type>,BS,EQ>;

  link->da_UnpackAndInsert = UnpackAndOp<Type,AtomicInsert<Type>,BS,EQ>;
  link->da_UnpackAndAdd    = UnpackAndOp<Type,AtomicAdd<Type>,BS,EQ>;
  link->da_UnpackAndMult   = NULL; /* Not implemented yet */
  link->da_FetchAndAdd     = NULL; /* Return value of atomicAdd on complex is not atomic */

  link->da_ScatterAndInsert = ScatterAndOp<Type,AtomicInsert<Type>,BS,EQ>;
  link->da_ScatterAndAdd    = ScatterAndOp<Type,AtomicAdd<Type>,BS,EQ>;
}
#endif

typedef signed char   SignedChar;
typedef unsigned char UnsignedChar;
typedef struct {int a;      int b;     } PairInt;
typedef struct {PetscInt a; PetscInt b;} PairPetscInt;

template<typename Type>
static void PackInit_PairType(PetscSFLink link)
{
  link->d_Pack            = Pack<Type,1,1>;
  link->d_UnpackAndInsert = UnpackAndOp<Type,Insert<Type>,1,1>;
  link->d_UnpackAndMaxloc = UnpackAndOp<Type,Maxloc<Type>,1,1>;
  link->d_UnpackAndMinloc = UnpackAndOp<Type,Minloc<Type>,1,1>;

  link->d_ScatterAndInsert = ScatterAndOp<Type,Insert<Type>,1,1>;
  link->d_ScatterAndMaxloc = ScatterAndOp<Type,Maxloc<Type>,1,1>;
  link->d_ScatterAndMinloc = ScatterAndOp<Type,Minloc<Type>,1,1>;
  /* Atomics for pair types are not implemented yet */
}

template<typename Type,PetscInt BS,PetscInt EQ>
static void PackInit_DumbType(PetscSFLink link)
{
  link->d_Pack             = Pack<Type,BS,EQ>;
  link->d_UnpackAndInsert  = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
  link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>;
  /* Atomics for dumb types are not implemented yet */
}

/* Some device-specific utilities */
static PetscErrorCode PetscSFLinkSyncDevice_Cuda(PetscSFLink link)
{
  cudaError_t cerr;
  PetscFunctionBegin;
  cerr = cudaDeviceSynchronize();CHKERRCUDA(cerr);
  PetscFunctionReturn(0);
}

static PetscErrorCode PetscSFLinkSyncStream_Cuda(PetscSFLink link)
{
  cudaError_t cerr;
  PetscFunctionBegin;
  cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
  PetscFunctionReturn(0);
}

static PetscErrorCode PetscSFLinkMemcpy_Cuda(PetscSFLink link,PetscMemType dstmtype,void *dst,PetscMemType srcmtype,const void *src,size_t n)
{
  PetscFunctionBegin;
  enum cudaMemcpyKind kinds[2][2] = {{cudaMemcpyHostToHost,cudaMemcpyHostToDevice},{cudaMemcpyDeviceToHost,cudaMemcpyDeviceToDevice}};

  if (n) {
    if (dstmtype == PETSC_MEMTYPE_HOST && srcmtype == PETSC_MEMTYPE_HOST) { /* Separate HostToHost so that pure-CPU code won't call the CUDA runtime */
      PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);
    } else { /* Assume PETSC_MEMTYPE_HOST=0, PETSC_MEMTYPE_DEVICE=1 */
      cudaError_t err = cudaMemcpyAsync(dst,src,n,kinds[srcmtype][dstmtype],link->stream);CHKERRCUDA(err);
    }
  }
  PetscFunctionReturn(0);
}
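/* For example (illustrative, with hypothetical buffers h_buf and d_buf): copying n bytes host-to-device through the link,
       ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,d_buf,PETSC_MEMTYPE_HOST,h_buf,n);CHKERRQ(ierr);
   selects kinds[0][1] = cudaMemcpyHostToDevice and enqueues the copy on link->stream. */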

PetscErrorCode PetscSFMalloc_Cuda(PetscMemType mtype,size_t size,void **ptr)
{
  PetscFunctionBegin;
  if (mtype == PETSC_MEMTYPE_HOST) {PetscErrorCode ierr = PetscMalloc(size,ptr);CHKERRQ(ierr);}
  else if (mtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaMalloc(ptr,size);CHKERRCUDA(err);}
  else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %d",(int)mtype);
  PetscFunctionReturn(0);
}

PetscErrorCode PetscSFFree_Cuda(PetscMemType mtype,void *ptr)
{
  PetscFunctionBegin;
  if (mtype == PETSC_MEMTYPE_HOST) {PetscErrorCode ierr = PetscFree(ptr);CHKERRQ(ierr);}
  else if (mtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaFree(ptr);CHKERRCUDA(err);}
  else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %d",(int)mtype);
  PetscFunctionReturn(0);
}

/*====================================================================================*/
/* Main driver to init MPI datatype on device                                         */
/*====================================================================================*/

/* Some fields of link are initialized by PetscSFPackSetUp_Host. This routine only does what is needed on the device */
PetscErrorCode PetscSFLinkSetUp_Cuda(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
{
  PetscErrorCode ierr;
  cudaError_t    err;
  PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
  PetscBool      is2Int,is2PetscInt;
#if defined(PETSC_HAVE_COMPLEX)
  PetscInt       nPetscComplex=0;
#endif

  PetscFunctionBegin;
  if (link->deviceinited) PetscFunctionReturn(0);
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
  /* MPI_CHAR is treated below as a dumb type that does not support reduction according to the MPI standard */
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
#if defined(PETSC_HAVE_COMPLEX)
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
#endif
  ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);

  if (is2Int) {
    PackInit_PairType<PairInt>(link);
  } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
    PackInit_PairType<PairPetscInt>(link);
  } else if (nPetscReal) {
    if      (nPetscReal == 8) PackInit_RealType<PetscReal,8,1>(link); else if (nPetscReal%8 == 0) PackInit_RealType<PetscReal,8,0>(link);
    else if (nPetscReal == 4) PackInit_RealType<PetscReal,4,1>(link); else if (nPetscReal%4 == 0) PackInit_RealType<PetscReal,4,0>(link);
    else if (nPetscReal == 2) PackInit_RealType<PetscReal,2,1>(link); else if (nPetscReal%2 == 0) PackInit_RealType<PetscReal,2,0>(link);
    else if (nPetscReal == 1) PackInit_RealType<PetscReal,1,1>(link); else if (nPetscReal%1 == 0) PackInit_RealType<PetscReal,1,0>(link);
  } else if (nPetscInt) {
    if      (nPetscInt == 8) PackInit_IntegerType<PetscInt,8,1>(link); else if (nPetscInt%8 == 0) PackInit_IntegerType<PetscInt,8,0>(link);
    else if (nPetscInt == 4) PackInit_IntegerType<PetscInt,4,1>(link); else if (nPetscInt%4 == 0) PackInit_IntegerType<PetscInt,4,0>(link);
    else if (nPetscInt == 2) PackInit_IntegerType<PetscInt,2,1>(link); else if (nPetscInt%2 == 0) PackInit_IntegerType<PetscInt,2,0>(link);
    else if (nPetscInt == 1) PackInit_IntegerType<PetscInt,1,1>(link); else if (nPetscInt%1 == 0) PackInit_IntegerType<PetscInt,1,0>(link);
#if defined(PETSC_USE_64BIT_INDICES)
  } else if (nInt) {
    if      (nInt == 8) PackInit_IntegerType<int,8,1>(link); else if (nInt%8 == 0) PackInit_IntegerType<int,8,0>(link);
    else if (nInt == 4) PackInit_IntegerType<int,4,1>(link); else if (nInt%4 == 0) PackInit_IntegerType<int,4,0>(link);
    else if (nInt == 2) PackInit_IntegerType<int,2,1>(link); else if (nInt%2 == 0) PackInit_IntegerType<int,2,0>(link);
    else if (nInt == 1) PackInit_IntegerType<int,1,1>(link); else if (nInt%1 == 0) PackInit_IntegerType<int,1,0>(link);
#endif
  } else if (nSignedChar) {
    if      (nSignedChar == 8) PackInit_IntegerType<SignedChar,8,1>(link); else if (nSignedChar%8 == 0) PackInit_IntegerType<SignedChar,8,0>(link);
    else if (nSignedChar == 4) PackInit_IntegerType<SignedChar,4,1>(link); else if (nSignedChar%4 == 0) PackInit_IntegerType<SignedChar,4,0>(link);
    else if (nSignedChar == 2) PackInit_IntegerType<SignedChar,2,1>(link); else if (nSignedChar%2 == 0) PackInit_IntegerType<SignedChar,2,0>(link);
    else if (nSignedChar == 1) PackInit_IntegerType<SignedChar,1,1>(link); else if (nSignedChar%1 == 0) PackInit_IntegerType<SignedChar,1,0>(link);
  } else if (nUnsignedChar) {
    if      (nUnsignedChar == 8) PackInit_IntegerType<UnsignedChar,8,1>(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType<UnsignedChar,8,0>(link);
    else if (nUnsignedChar == 4) PackInit_IntegerType<UnsignedChar,4,1>(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType<UnsignedChar,4,0>(link);
    else if (nUnsignedChar == 2) PackInit_IntegerType<UnsignedChar,2,1>(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType<UnsignedChar,2,0>(link);
    else if (nUnsignedChar == 1) PackInit_IntegerType<UnsignedChar,1,1>(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType<UnsignedChar,1,0>(link);
#if defined(PETSC_HAVE_COMPLEX)
  } else if (nPetscComplex) {
    if      (nPetscComplex == 8) PackInit_ComplexType<PetscComplex,8,1>(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType<PetscComplex,8,0>(link);
    else if (nPetscComplex == 4) PackInit_ComplexType<PetscComplex,4,1>(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType<PetscComplex,4,0>(link);
    else if (nPetscComplex == 2) PackInit_ComplexType<PetscComplex,2,1>(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType<PetscComplex,2,0>(link);
    else if (nPetscComplex == 1) PackInit_ComplexType<PetscComplex,1,1>(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType<PetscComplex,1,0>(link);
#endif
  } else {
    MPI_Aint lb,nbyte;
    ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr);
    if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
    if (nbyte % sizeof(int)) { /* If the type size is not a multiple of sizeof(int) */
      if      (nbyte == 4) PackInit_DumbType<char,4,1>(link); else if (nbyte%4 == 0) PackInit_DumbType<char,4,0>(link);
      else if (nbyte == 2) PackInit_DumbType<char,2,1>(link); else if (nbyte%2 == 0) PackInit_DumbType<char,2,0>(link);
      else if (nbyte == 1) PackInit_DumbType<char,1,1>(link); else if (nbyte%1 == 0) PackInit_DumbType<char,1,0>(link);
    } else {
      nInt = nbyte / sizeof(int);
      if      (nInt == 8) PackInit_DumbType<int,8,1>(link); else if (nInt%8 == 0) PackInit_DumbType<int,8,0>(link);
      else if (nInt == 4) PackInit_DumbType<int,4,1>(link); else if (nInt%4 == 0) PackInit_DumbType<int,4,0>(link);
      else if (nInt == 2) PackInit_DumbType<int,2,1>(link); else if (nInt%2 == 0) PackInit_DumbType<int,2,0>(link);
      else if (nInt == 1) PackInit_DumbType<int,1,1>(link); else if (nInt%1 == 0) PackInit_DumbType<int,1,0>(link);
    }
  }

  if (!sf->use_default_stream) {err = cudaStreamCreate(&link->stream);CHKERRCUDA(err);}
  if (!sf->maxResidentThreadsPerGPU) { /* Not initialized */
    int                   device;
    struct cudaDeviceProp props;
    err = cudaGetDevice(&device);CHKERRCUDA(err);
    err = cudaGetDeviceProperties(&props,device);CHKERRCUDA(err);
    sf->maxResidentThreadsPerGPU = props.maxThreadsPerMultiProcessor*props.multiProcessorCount;
  }
  link->maxResidentThreadsPerGPU = sf->maxResidentThreadsPerGPU;

  link->d_SyncDevice = PetscSFLinkSyncDevice_Cuda;
  link->d_SyncStream = PetscSFLinkSyncStream_Cuda;
  link->Memcpy       = PetscSFLinkMemcpy_Cuda;
  link->deviceinited = PETSC_TRUE;
  PetscFunctionReturn(0);
}