#include <../src/vec/is/sf/impls/basic/sfpack.h>
#include <cuda_runtime.h>
#include <petsccublas.h> /* For CHKERRCUDA */

/* Map a thread id to an index in root/leaf space through a series of 3D subdomains. See PetscSFPackOpt. */
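/* Presumed layout of opt[] as decoded below (inferred from the offsets used in this routine; see PetscSFPackOpt
   for the authoritative definition): opt[0] = n, the number of subdomains, followed by n+1 thread-offset entries,
   then n entries each for start, dx and dy, a block at opt+4*n+2 that is not read here (presumably dz), and
   finally n entries each for X and Y. */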
__device__ static inline PetscInt MapTidToIndex(const PetscInt *opt,PetscInt tid)
{
  PetscInt        i,j,k,m,n,r;
  const PetscInt  *offset,*start,*dx,*dy,*X,*Y;

  n      = opt[0];
  offset = opt + 1;
  start  = opt + n + 2;
  dx     = opt + 2*n + 2;
  dy     = opt + 3*n + 2;
  X      = opt + 5*n + 2;
  Y      = opt + 6*n + 2;
  for (r=0; r<n; r++) {if (tid < offset[r+1]) break;}
  m = (tid - offset[r]);
  k = m/(dx[r]*dy[r]);
  j = (m - k*dx[r]*dy[r])/dx[r];
  i = m - k*dx[r]*dy[r] - j*dx[r];

  return (start[r] + k*X[r]*Y[r] + j*X[r] + i);
}

/*====================================================================================*/
/*  Templated CUDA kernels for pack/unpack. The Op can be regular or atomic           */
/*====================================================================================*/

/* Suppose a user calls PetscSFReduce(sf,unit,...) and <unit> is an MPI datatype made of 16 PetscReals. Then
   <Type> is PetscReal, the primitive type we operate on;
   <bs>   is 16, i.e., <unit> contains 16 primitive types;
   <BS>   is 8, the maximal SIMD width over which we try to vectorize operations on <unit>;
   <EQ>   is 0, i.e., (bs == BS ? 1 : 0).

  If instead <unit> has 8 PetscReals, then bs=8, BS=8, EQ=1, making MBS below a compile-time constant.
  For the common case in VecScatter, bs=1, BS=1, EQ=1, MBS=1, and the inner for-loops below are fully unrolled.
*/
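/* As an illustration only (this exact instantiation appears nowhere in this file): for the 16-PetscReal unit
   above, PetscSFLinkSetUp_Cuda at the bottom picks PackInit_RealType<PetscReal,8,0>, and the Pack() wrapper
   further below then effectively launches

     d_Pack<PetscReal,8,0><<<nblocks,nthreads,0,link->stream>>>(16 /\* bs *\/,count,start,iarray,idx,(const PetscReal*)data,(PetscReal*)buf);

   so inside the kernel M = bs/BS = 2, MBS = 16, and each thread copies the 16 PetscReals of one unit per index. */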
template<class Type,PetscInt BS,PetscInt EQ>
__global__ static void d_Pack(PetscInt bs,PetscInt count,PetscInt start,const PetscInt *opt,const PetscInt *idx,const Type *data,Type *buf)
{
  PetscInt        i,s,t,tid = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscInt  grid_size = gridDim.x * blockDim.x;
  const PetscInt  M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables the compiler's const-propagation */
  const PetscInt  MBS = M*BS;  /* MBS=bs. We turn MBS into a compile-time const when EQ=1. */

  for (; tid<count; tid += grid_size) {
    /* opt != NULL ==> idx == NULL, i.e., the indices follow patterns but are not contiguous;
       opt == NULL && idx == NULL ==> the indices are contiguous;
     */
    t = (opt? MapTidToIndex(opt,tid) : (idx? idx[tid] : start+tid))*MBS;
    s = tid*MBS;
    for (i=0; i<MBS; i++) buf[s+i] = data[t+i];
  }
}

template<class Type,class Op,PetscInt BS,PetscInt EQ>
__global__ static void d_UnpackAndOp(PetscInt bs,PetscInt count,PetscInt start,const PetscInt *opt,const PetscInt *idx,Type *data,const Type *buf)
{
  PetscInt        i,s,t,tid = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscInt  grid_size = gridDim.x * blockDim.x;
  const PetscInt  M = (EQ) ? 1 : bs/BS, MBS = M*BS;
  Op              op;

  for (; tid<count; tid += grid_size) {
    t = (opt? MapTidToIndex(opt,tid) : (idx? idx[tid] : start+tid))*MBS;
    s = tid*MBS;
    for (i=0; i<MBS; i++) op(data[t+i],buf[s+i]);
  }
}

template<class Type,class Op,PetscInt BS,PetscInt EQ>
__global__ static void d_FetchAndOp(PetscInt bs,PetscInt count,PetscInt rootstart,const PetscInt *rootopt,const PetscInt *rootidx,Type *rootdata,Type *leafbuf)
{
  PetscInt        i,l,r,tid = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscInt  grid_size = gridDim.x * blockDim.x;
  const PetscInt  M = (EQ) ? 1 : bs/BS, MBS = M*BS;
  Op              op;

  for (; tid<count; tid += grid_size) {
    r = (rootopt? MapTidToIndex(rootopt,tid) : (rootidx? rootidx[tid] : rootstart+tid))*MBS;
    l = tid*MBS;
    for (i=0; i<MBS; i++) leafbuf[l+i] = op(rootdata[r+i],leafbuf[l+i]);
  }
}

template<class Type,class Op,PetscInt BS,PetscInt EQ>
__global__ static void d_ScatterAndOp(PetscInt bs,PetscInt count,PetscInt srcx,PetscInt srcy,PetscInt srcX,PetscInt srcY,PetscInt srcStart,const PetscInt* srcIdx,const Type *src,PetscInt dstx,PetscInt dsty,PetscInt dstX,PetscInt dstY,PetscInt dstStart,const PetscInt *dstIdx,Type *dst)
{
  PetscInt        i,j,k,s,t,tid = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscInt  grid_size = gridDim.x * blockDim.x;
  const PetscInt  M = (EQ) ? 1 : bs/BS, MBS = M*BS;
  Op              op;

  for (; tid<count; tid += grid_size) {
    if (!srcIdx) { /* src is either contiguous or 3D */
      k = tid/(srcx*srcy);
      j = (tid - k*srcx*srcy)/srcx;
      i = tid - k*srcx*srcy - j*srcx;
      s = srcStart + k*srcX*srcY + j*srcX + i;
    } else {
      s = srcIdx[tid];
    }

    if (!dstIdx) { /* dst is either contiguous or 3D */
      k = tid/(dstx*dsty);
      j = (tid - k*dstx*dsty)/dstx;
      i = tid - k*dstx*dsty - j*dstx;
      t = dstStart + k*dstX*dstY + j*dstX + i;
    } else {
      t = dstIdx[tid];
    }

    s *= MBS;
    t *= MBS;
    for (i=0; i<MBS; i++) op(dst[t+i],src[s+i]);
  }
}

template<class Type,class Op,PetscInt BS,PetscInt EQ>
__global__ static void d_FetchAndOpLocal(PetscInt bs,PetscInt count,PetscInt rootstart,const PetscInt *rootopt,const PetscInt *rootidx,Type *rootdata,PetscInt leafstart,const PetscInt *leafopt,const PetscInt *leafidx,const Type *leafdata,Type *leafupdate)
{
  PetscInt        i,l,r,tid = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscInt  grid_size = gridDim.x * blockDim.x;
  const PetscInt  M = (EQ) ? 1 : bs/BS, MBS = M*BS;
  Op              op;

  for (; tid<count; tid += grid_size) {
    r = (rootopt? MapTidToIndex(rootopt,tid) : (rootidx? rootidx[tid] : rootstart+tid))*MBS;
    l = (leafopt? MapTidToIndex(leafopt,tid) : (leafidx? leafidx[tid] : leafstart+tid))*MBS;
    for (i=0; i<MBS; i++) leafupdate[l+i] = op(rootdata[r+i],leafdata[l+i]);
  }
}

/*====================================================================================*/
/*                             Regular operations on device                           */
/*====================================================================================*/
template<typename Type> struct Insert {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = y;             return old;}};
template<typename Type> struct Add    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x += y;             return old;}};
template<typename Type> struct Mult   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x *= y;             return old;}};
template<typename Type> struct Min    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = PetscMin(x,y); return old;}};
template<typename Type> struct Max    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = PetscMax(x,y); return old;}};
template<typename Type> struct LAND   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x && y;        return old;}};
template<typename Type> struct LOR    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x || y;        return old;}};
template<typename Type> struct LXOR   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = !x != !y;      return old;}};
template<typename Type> struct BAND   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x & y;         return old;}};
template<typename Type> struct BOR    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x | y;         return old;}};
template<typename Type> struct BXOR   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x ^ y;         return old;}};
template<typename Type> struct Minloc {
  __device__ Type operator() (Type& x,Type y) const {
    Type old = x;
    if (y.a < x.a) x = y;
    else if (y.a == x.a) x.b = min(x.b,y.b);
    return old;
  }
};
template<typename Type> struct Maxloc {
  __device__ Type operator() (Type& x,Type y) const {
    Type old = x;
    if (y.a > x.a) x = y;
    else if (y.a == x.a) x.b = min(x.b,y.b); /* See MPI MAXLOC */
    return old;
  }
};
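/* Each functor above returns the pre-op value of x: d_UnpackAndOp ignores that return value, while
   d_FetchAndOp and d_FetchAndOpLocal store it into leafbuf/leafupdate. */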

/*====================================================================================*/
/*                             Atomic operations on device                            */
/*====================================================================================*/

/*
  Atomic Insert (exchange) operations

  CUDA C Programming Guide V10.1 Chapter B.12.1.3:

  int atomicExch(int* address, int val);
  unsigned int atomicExch(unsigned int* address, unsigned int val);
  unsigned long long int atomicExch(unsigned long long int* address, unsigned long long int val);
  float atomicExch(float* address, float val);

  reads the 32-bit or 64-bit word old located at the address address in global or shared
  memory and stores val back to memory at the same address. These two operations are
  performed in one atomic transaction. The function returns old.

  PETSc notes:

  It may be useful in PetscSFFetchAndOp with op = MPIU_REPLACE.

  VecScatter with multiple entries scattered to the same location using INSERT_VALUES does not need
  atomic insertion, since it does not need the old value. A 32-bit or 64-bit store instruction should
  be atomic by itself.

  With bs>1 and a unit larger than 64 bits, the current element-wise atomic approach cannot guarantee that
  the whole insertion is atomic. We hope no user code relies on that.
*/
__device__ static double atomicExch(double* address,double val) {return __longlong_as_double(atomicExch((unsigned long long int*)address,__double_as_longlong(val)));}

#if defined(PETSC_USE_64BIT_INDICES)
__device__ static PetscInt atomicExch(PetscInt* address,PetscInt val) {return (PetscInt)(atomicExch((unsigned long long int*)address,(unsigned long long int)val));}
#endif

template<typename Type> struct AtomicInsert {__device__ Type operator() (Type& x,Type y) const {return atomicExch(&x,y);}};

#if defined(PETSC_HAVE_COMPLEX)
#if defined(PETSC_USE_REAL_DOUBLE)
/* CUDA does not support 128-bit atomics. Users should not insert different 128-bit PetscComplex values to the same location */
template<> struct AtomicInsert<PetscComplex> {
  __device__ PetscComplex operator() (PetscComplex& x,PetscComplex y) const {
    PetscComplex         old, *z = &old;
    double               *xp = (double*)&x,*yp = (double*)&y;
    AtomicInsert<double> op;
    z[0] = op(xp[0],yp[0]);
    z[1] = op(xp[1],yp[1]);
    return old; /* The returned value may not be atomic. It can be a mix of two ops. The caller should discard it. */
  }
};
#elif defined(PETSC_USE_REAL_SINGLE)
template<> struct AtomicInsert<PetscComplex> {
  __device__ PetscComplex operator() (PetscComplex& x,PetscComplex y) const {
    double               *xp = (double*)&x,*yp = (double*)&y;
    AtomicInsert<double> op;
    return op(xp[0],yp[0]);
  }
};
#endif
#endif

/*
  Atomic add operations

  CUDA C Programming Guide V10.1 Chapter B.12.1.1:

  int atomicAdd(int* address, int val);
  unsigned int atomicAdd(unsigned int* address,unsigned int val);
  unsigned long long int atomicAdd(unsigned long long int* address,unsigned long long int val);
  float atomicAdd(float* address, float val);
  double atomicAdd(double* address, double val);
  __half2 atomicAdd(__half2 *address, __half2 val);
  __half atomicAdd(__half *address, __half val);

  reads the 16-bit, 32-bit or 64-bit word old located at the address address in global or shared memory, computes (old + val),
  and stores the result back to memory at the same address. These three operations are performed in one atomic transaction. The
  function returns old.

  The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher.
  The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher.
  The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and
  higher. The atomicity of the __half2 add operation is guaranteed separately for each of the two __half elements;
  the entire __half2 is not guaranteed to be atomic as a single 32-bit access.
  The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher.
*/

#if defined(PETSC_USE_64BIT_INDICES)
__device__ static PetscInt atomicAdd(PetscInt* address,PetscInt val) {return (PetscInt)atomicAdd((unsigned long long int*)address,(unsigned long long int)val);}
#endif

template<typename Type> struct AtomicAdd {__device__ Type operator() (Type& x,Type y) const {return atomicAdd(&x,y);}};

template<> struct AtomicAdd<double> {
  __device__ double operator() (double& x,double y) const {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
    return atomicAdd(&x,y);
#else
    double                 *address = &x, val = y;
    unsigned long long int *address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
      assumed = old;
      old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
      /* Note: uses integer comparison to avoid a hang in case of NaN (since NaN != NaN) */
    } while (assumed != old);
    return __longlong_as_double(old);
#endif
  }
};

template<> struct AtomicAdd<float> {
  __device__ float operator() (float& x,float y) const {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
    return atomicAdd(&x,y);
#else
    float *address = &x, val = y;
    int   *address_as_int = (int*)address;
    int   old = *address_as_int, assumed;
    do {
      assumed = old;
      old     = atomicCAS(address_as_int, assumed, __float_as_int(val + __int_as_float(assumed)));
      /* Note: uses integer comparison to avoid a hang in case of NaN (since NaN != NaN) */
    } while (assumed != old);
    return __int_as_float(old);
#endif
  }
};

#if defined(PETSC_HAVE_COMPLEX)
template<> struct AtomicAdd<PetscComplex> {
 __device__ PetscComplex operator() (PetscComplex& x,PetscComplex y) const {
  PetscComplex         old, *z = &old;
  PetscReal            *xp = (PetscReal*)&x,*yp = (PetscReal*)&y;
  AtomicAdd<PetscReal> op;
  z[0] = op(xp[0],yp[0]);
  z[1] = op(xp[1],yp[1]);
  return old; /* The returned value may not be atomic. It can be a mix of two ops. The caller should discard it. */
 }
};
#endif

/*
  Atomic Mult operations:

  CUDA has no atomicMult at all, so we build our own with atomicCAS
 */
#if defined(PETSC_USE_REAL_DOUBLE)
__device__ static double atomicMult(double* address, double val)
{
  unsigned long long int *address_as_ull = (unsigned long long int*)(address);
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    /* Other threads can access and modify the value of *address_as_ull between the read above and the write below */
    old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(val*__longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}
#elif defined(PETSC_USE_REAL_SINGLE)
__device__ static float atomicMult(float* address,float val)
{
  int *address_as_int = (int*)(address);
  int old = *address_as_int, assumed;
  do {
    assumed  = old;
    old      = atomicCAS(address_as_int, assumed, __float_as_int(val*__int_as_float(assumed)));
  } while (assumed != old);
  return __int_as_float(old);
}
#endif

__device__ static int atomicMult(int* address,int val)
{
  int *address_as_int = (int*)(address);
  int old = *address_as_int, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_int, assumed, val*assumed);
  } while (assumed != old);
  return (int)old;
}

#if defined(PETSC_USE_64BIT_INDICES)
__device__ static PetscInt atomicMult(PetscInt* address,PetscInt val)
{
  unsigned long long int *address_as_ull = (unsigned long long int*)(address);
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (unsigned long long int)(val*(PetscInt)assumed));
  } while (assumed != old);
  return (PetscInt)old;
}
#endif

template<typename Type> struct AtomicMult {__device__ Type operator() (Type& x,Type y) const {return atomicMult(&x,y);}};

/*
  Atomic Min/Max operations

  CUDA C Programming Guide V10.1 Chapter B.12.1.4~5:

  int atomicMin(int* address, int val);
  unsigned int atomicMin(unsigned int* address,unsigned int val);
  unsigned long long int atomicMin(unsigned long long int* address,unsigned long long int val);

  reads the 32-bit or 64-bit word old located at the address address in global or shared
  memory, computes the minimum of old and val, and stores the result back to memory
  at the same address. These three operations are performed in one atomic transaction.
  The function returns old.
  The 64-bit version of atomicMin() is only supported by devices of compute capability 3.5 and higher.

  atomicMax() is similar.
 */

#if defined(PETSC_USE_REAL_DOUBLE)
__device__ static double atomicMin(double* address, double val)
{
  unsigned long long int *address_as_ull = (unsigned long long int*)(address);
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMin(val,__longlong_as_double(assumed))));
  } while (assumed != old);
  return __longlong_as_double(old);
}

__device__ static double atomicMax(double* address, double val)
{
  unsigned long long int *address_as_ull = (unsigned long long int*)(address);
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed  = old;
    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMax(val,__longlong_as_double(assumed))));
  } while (assumed != old);
  return __longlong_as_double(old);
}
#elif defined(PETSC_USE_REAL_SINGLE)
__device__ static float atomicMin(float* address,float val)
{
  int *address_as_int = (int*)(address);
  int old = *address_as_int, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_int, assumed, __float_as_int(PetscMin(val,__int_as_float(assumed))));
  } while (assumed != old);
  return __int_as_float(old);
}

__device__ static float atomicMax(float* address,float val)
{
  int *address_as_int = (int*)(address);
  int old = *address_as_int, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_int, assumed, __float_as_int(PetscMax(val,__int_as_float(assumed))));
  } while (assumed != old);
  return __int_as_float(old);
}
#endif

/*
  atomicMin/Max(long long *, long long) are not in NVIDIA's documentation, but on OLCF Summit we found
  atomicMin/Max/And/Or/Xor(long long *, long long) in /sw/summit/cuda/10.1.243/include/sm_32_atomic_functions.h.
  Defining our own versions then causes compilation errors with PGI compilers and 64-bit indices:
      error: function "atomicMin(long long *, long long)" has already been defined

  So we guard ours with the extra condition defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320).
*/
#if defined(PETSC_USE_64BIT_INDICES) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320)
__device__ static PetscInt atomicMin(PetscInt* address,PetscInt val)
{
  unsigned long long int *address_as_ull = (unsigned long long int*)(address);
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (unsigned long long int)(PetscMin(val,(PetscInt)assumed)));
  } while (assumed != old);
  return (PetscInt)old;
}

__device__ static PetscInt atomicMax(PetscInt* address,PetscInt val)
{
  unsigned long long int *address_as_ull = (unsigned long long int*)(address);
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (unsigned long long int)(PetscMax(val,(PetscInt)assumed)));
  } while (assumed != old);
  return (PetscInt)old;
}
#endif

template<typename Type> struct AtomicMin {__device__ Type operator() (Type& x,Type y) const {return atomicMin(&x,y);}};
template<typename Type> struct AtomicMax {__device__ Type operator() (Type& x,Type y) const {return atomicMax(&x,y);}};

/*
  Atomic bitwise operations

  CUDA C Programming Guide V10.1 Chapter B.12.2.1 ~ B.12.2.3:

  int atomicAnd(int* address, int val);
  unsigned int atomicAnd(unsigned int* address,unsigned int val);
  unsigned long long int atomicAnd(unsigned long long int* address,unsigned long long int val);

  reads the 32-bit or 64-bit word old located at the address address in global or shared
  memory, computes (old & val), and stores the result back to memory at the same
  address. These three operations are performed in one atomic transaction.
  The function returns old.

  The 64-bit version of atomicAnd() is only supported by devices of compute capability 3.5 and higher.

  atomicOr() and atomicXor() are similar.
*/

#if defined(PETSC_USE_64BIT_INDICES)
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320) /* Why 320? See the comments at atomicMin(PetscInt* address,PetscInt val) */
__device__ static PetscInt atomicAnd(PetscInt* address,PetscInt val)
{
  unsigned long long int *address_as_ull = (unsigned long long int*)(address);
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (unsigned long long int)(val & (PetscInt)assumed));
  } while (assumed != old);
  return (PetscInt)old;
}
__device__ static PetscInt atomicOr(PetscInt* address,PetscInt val)
{
  unsigned long long int *address_as_ull = (unsigned long long int*)(address);
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (unsigned long long int)(val | (PetscInt)assumed));
  } while (assumed != old);
  return (PetscInt)old;
}

__device__ static PetscInt atomicXor(PetscInt* address,PetscInt val)
{
  unsigned long long int *address_as_ull = (unsigned long long int*)(address);
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (unsigned long long int)(val ^ (PetscInt)assumed));
  } while (assumed != old);
  return (PetscInt)old;
}
#else
/*
 See also the comments at atomicMin(PetscInt* address,PetscInt val)
__device__ static PetscInt atomicAnd(PetscInt* address,PetscInt val) {return (PetscInt)atomicAnd((unsigned long long int*)address,(unsigned long long int)val);}
__device__ static PetscInt atomicOr (PetscInt* address,PetscInt val) {return (PetscInt)atomicOr ((unsigned long long int*)address,(unsigned long long int)val);}
__device__ static PetscInt atomicXor(PetscInt* address,PetscInt val) {return (PetscInt)atomicXor((unsigned long long int*)address,(unsigned long long int)val);}
*/
#endif
#endif

template<typename Type> struct AtomicBAND {__device__ Type operator() (Type& x,Type y) const {return atomicAnd(&x,y);}};
template<typename Type> struct AtomicBOR  {__device__ Type operator() (Type& x,Type y) const {return atomicOr (&x,y);}};
template<typename Type> struct AtomicBXOR {__device__ Type operator() (Type& x,Type y) const {return atomicXor(&x,y);}};

/*
  Atomic logical operations:

  CUDA has no atomic logical operations at all. We support them on integer types.
*/

/* A primary template without a definition makes any instantiation that does not match one of the given
   specializations a compile-time error, which is what we want since we only support 32-bit and 64-bit integers.
 */
template<typename Type,class Op,int size/* sizeof(Type) */> struct AtomicLogical;
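/* For example (illustrative only): instantiating AtomicLAND<short> below would require AtomicLogical<short,land<short>,2>,
   for which no definition exists, so it fails at compile time as intended. */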

template<typename Type,class Op>
struct AtomicLogical<Type,Op,4> {
  __device__ Type operator()(Type& x,Type y) const {
    int *address_as_int = (int*)(&x);
    int old = *address_as_int, assumed;
    Op op;
    do {
      assumed = old;
      old     = atomicCAS(address_as_int, assumed, (int)(op((Type)assumed,y)));
    } while (assumed != old);
    return (Type)old;
  }
};

template<typename Type,class Op>
struct AtomicLogical<Type,Op,8> {
  __device__ Type operator()(Type& x,Type y) const {
    unsigned long long int *address_as_ull = (unsigned long long int*)(&x);
    unsigned long long int old = *address_as_ull, assumed;
    Op op;
    do {
      assumed = old;
      old     = atomicCAS(address_as_ull, assumed, (unsigned long long int)(op((Type)assumed,y)));
    } while (assumed != old);
    return (Type)old;
  }
};

/* Note land/lor/lxor below are different from LAND etc. above. Here we pass arguments by value and return the result of the op (not the old value) */
template<typename Type> struct land {__device__ Type operator()(Type x, Type y) {return x && y;}};
template<typename Type> struct lor  {__device__ Type operator()(Type x, Type y) {return x || y;}};
template<typename Type> struct lxor {__device__ Type operator()(Type x, Type y) {return (!x != !y);}};

template<typename Type> struct AtomicLAND {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,land<Type>,sizeof(Type)> op; return op(x,y);}};
template<typename Type> struct AtomicLOR  {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,lor<Type> ,sizeof(Type)> op; return op(x,y);}};
template<typename Type> struct AtomicLXOR {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,lxor<Type>,sizeof(Type)> op; return op(x,y);}};

/*====================================================================================*/
/*  Wrapper functions of cuda kernels. Function pointers are stored in 'link'         */
/*====================================================================================*/
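/* Note on launch configuration: the kernels above use grid-stride loops, so the wrappers below cap nblocks at
   link->maxResidentThreadsPerGPU/nthreads (with nthreads=256) and still cover all <count> entries, rather than
   launching one thread per entry. */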
template<typename Type,PetscInt BS,PetscInt EQ>
static PetscErrorCode Pack(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *data,void *buf)
{
  cudaError_t        cerr;
  PetscInt           nthreads=256;
  PetscInt           nblocks=(count+nthreads-1)/nthreads;
  const PetscInt     *iarray=opt ? opt->array : NULL;

  PetscFunctionBegin;
  if (!count) PetscFunctionReturn(0);
  nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
  d_Pack<Type,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,start,iarray,idx,(const Type*)data,(Type*)buf);
  cerr = cudaGetLastError();CHKERRCUDA(cerr);
  PetscFunctionReturn(0);
}

template<typename Type,class Op,PetscInt BS,PetscInt EQ>
static PetscErrorCode UnpackAndOp(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *data,const void *buf)
{
  cudaError_t        cerr;
  PetscInt           nthreads=256;
  PetscInt           nblocks=(count+nthreads-1)/nthreads;
  const PetscInt     *iarray=opt ? opt->array : NULL;

  PetscFunctionBegin;
  if (!count) PetscFunctionReturn(0);
  nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
  d_UnpackAndOp<Type,Op,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,start,iarray,idx,(Type*)data,(const Type*)buf);
  cerr = cudaGetLastError();CHKERRCUDA(cerr);
  PetscFunctionReturn(0);
}

template<typename Type,class Op,PetscInt BS,PetscInt EQ>
static PetscErrorCode FetchAndOp(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *data,void *buf)
{
  cudaError_t        cerr;
  PetscInt           nthreads=256;
  PetscInt           nblocks=(count+nthreads-1)/nthreads;
  const PetscInt     *iarray=opt ? opt->array : NULL;

  PetscFunctionBegin;
  if (!count) PetscFunctionReturn(0);
  nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
  d_FetchAndOp<Type,Op,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,start,iarray,idx,(Type*)data,(Type*)buf);
  cerr = cudaGetLastError();CHKERRCUDA(cerr);
  PetscFunctionReturn(0);
}

template<typename Type,class Op,PetscInt BS,PetscInt EQ>
static PetscErrorCode ScatterAndOp(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst)
{
  cudaError_t        cerr;
  PetscInt           nthreads=256;
  PetscInt           nblocks=(count+nthreads-1)/nthreads;
  PetscInt           srcx=0,srcy=0,srcX=0,srcY=0,dstx=0,dsty=0,dstX=0,dstY=0;

  PetscFunctionBegin;
  if (!count) PetscFunctionReturn(0);
  nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);

  /* The 3D shape of the source subdomain may differ from that of the destination, which makes it difficult to use a CUDA 3D grid and block */
  if (srcOpt)       {srcx = srcOpt->dx[0]; srcy = srcOpt->dy[0]; srcX = srcOpt->X[0]; srcY = srcOpt->Y[0]; srcStart = srcOpt->start[0]; srcIdx = NULL;}
  else if (!srcIdx) {srcx = srcX = count; srcy = srcY = 1;}

  if (dstOpt)       {dstx = dstOpt->dx[0]; dsty = dstOpt->dy[0]; dstX = dstOpt->X[0]; dstY = dstOpt->Y[0]; dstStart = dstOpt->start[0]; dstIdx = NULL;}
  else if (!dstIdx) {dstx = dstX = count; dsty = dstY = 1;}

  d_ScatterAndOp<Type,Op,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,srcx,srcy,srcX,srcY,srcStart,srcIdx,(const Type*)src,dstx,dsty,dstX,dstY,dstStart,dstIdx,(Type*)dst);
  cerr = cudaGetLastError();CHKERRCUDA(cerr);
  PetscFunctionReturn(0);
}

/* Specialization for Insert since we may use cudaMemcpyAsync */
template<typename Type,PetscInt BS,PetscInt EQ>
static PetscErrorCode ScatterAndInsert(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst)
{
  PetscErrorCode    ierr;
  cudaError_t       cerr;

  PetscFunctionBegin;
  if (!count) PetscFunctionReturn(0);
  /* src and dst are contiguous */
  if ((!srcOpt && !srcIdx) && (!dstOpt && !dstIdx) && src != dst) {
    cerr = cudaMemcpyAsync((Type*)dst+dstStart*link->bs,(const Type*)src+srcStart*link->bs,count*link->unitbytes,cudaMemcpyDeviceToDevice,link->stream);CHKERRCUDA(cerr);
  } else {
    ierr = ScatterAndOp<Type,Insert<Type>,BS,EQ>(link,count,srcStart,srcOpt,srcIdx,src,dstStart,dstOpt,dstIdx,dst);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

template<typename Type,class Op,PetscInt BS,PetscInt EQ>
static PetscErrorCode FetchAndOpLocal(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate)
{
  cudaError_t       cerr;
  PetscInt          nthreads=256;
  PetscInt          nblocks=(count+nthreads-1)/nthreads;
  const PetscInt    *rarray = rootopt ? rootopt->array : NULL;
  const PetscInt    *larray = leafopt ? leafopt->array : NULL;

  PetscFunctionBegin;
  if (!count) PetscFunctionReturn(0);
  nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
  d_FetchAndOpLocal<Type,Op,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,rootstart,rarray,rootidx,(Type*)rootdata,leafstart,larray,leafidx,(const Type*)leafdata,(Type*)leafupdate);
  cerr = cudaGetLastError();CHKERRCUDA(cerr);
  PetscFunctionReturn(0);
}

/*====================================================================================*/
/*  Init various types and instantiate pack/unpack function pointers                  */
/*====================================================================================*/
template<typename Type,PetscInt BS,PetscInt EQ>
static void PackInit_RealType(PetscSFLink link)
{
  /* Pack/unpack for remote communication */
  link->d_Pack              = Pack<Type,BS,EQ>;
  link->d_UnpackAndInsert   = UnpackAndOp     <Type,Insert<Type>      ,BS,EQ>;
  link->d_UnpackAndAdd      = UnpackAndOp     <Type,Add<Type>         ,BS,EQ>;
  link->d_UnpackAndMult     = UnpackAndOp     <Type,Mult<Type>        ,BS,EQ>;
  link->d_UnpackAndMin      = UnpackAndOp     <Type,Min<Type>         ,BS,EQ>;
  link->d_UnpackAndMax      = UnpackAndOp     <Type,Max<Type>         ,BS,EQ>;
  link->d_FetchAndAdd       = FetchAndOp      <Type,Add<Type>         ,BS,EQ>;

  /* Scatter for local communication */
  link->d_ScatterAndInsert  = ScatterAndInsert<Type                   ,BS,EQ>; /* Has special optimizations */
  link->d_ScatterAndAdd     = ScatterAndOp    <Type,Add<Type>         ,BS,EQ>;
  link->d_ScatterAndMult    = ScatterAndOp    <Type,Mult<Type>        ,BS,EQ>;
  link->d_ScatterAndMin     = ScatterAndOp    <Type,Min<Type>         ,BS,EQ>;
  link->d_ScatterAndMax     = ScatterAndOp    <Type,Max<Type>         ,BS,EQ>;
  link->d_FetchAndAddLocal  = FetchAndOpLocal <Type,Add <Type>        ,BS,EQ>;

  /* Atomic versions when there are data-race possibilities */
  link->da_UnpackAndInsert  = UnpackAndOp     <Type,AtomicInsert<Type>,BS,EQ>;
  link->da_UnpackAndAdd     = UnpackAndOp     <Type,AtomicAdd<Type>   ,BS,EQ>;
  link->da_UnpackAndMult    = UnpackAndOp     <Type,AtomicMult<Type>  ,BS,EQ>;
  link->da_UnpackAndMin     = UnpackAndOp     <Type,AtomicMin<Type>   ,BS,EQ>;
  link->da_UnpackAndMax     = UnpackAndOp     <Type,AtomicMax<Type>   ,BS,EQ>;
  link->da_FetchAndAdd      = FetchAndOp      <Type,AtomicAdd<Type>   ,BS,EQ>;

  link->da_ScatterAndInsert = ScatterAndOp    <Type,AtomicInsert<Type>,BS,EQ>;
  link->da_ScatterAndAdd    = ScatterAndOp    <Type,AtomicAdd<Type>   ,BS,EQ>;
  link->da_ScatterAndMult   = ScatterAndOp    <Type,AtomicMult<Type>  ,BS,EQ>;
  link->da_ScatterAndMin    = ScatterAndOp    <Type,AtomicMin<Type>   ,BS,EQ>;
  link->da_ScatterAndMax    = ScatterAndOp    <Type,AtomicMax<Type>   ,BS,EQ>;
  link->da_FetchAndAddLocal = FetchAndOpLocal <Type,AtomicAdd<Type>   ,BS,EQ>;
}

/* Have this templated class so we can specialize it for char (1-byte) integers */
template<typename Type,PetscInt BS,PetscInt EQ,PetscInt size/*sizeof(Type)*/>
struct PackInit_IntegerType_Atomic {
  static void Init(PetscSFLink link) {
    link->da_UnpackAndInsert  = UnpackAndOp<Type,AtomicInsert<Type>,BS,EQ>;
    link->da_UnpackAndAdd     = UnpackAndOp<Type,AtomicAdd<Type>   ,BS,EQ>;
    link->da_UnpackAndMult    = UnpackAndOp<Type,AtomicMult<Type>  ,BS,EQ>;
    link->da_UnpackAndMin     = UnpackAndOp<Type,AtomicMin<Type>   ,BS,EQ>;
    link->da_UnpackAndMax     = UnpackAndOp<Type,AtomicMax<Type>   ,BS,EQ>;
    link->da_UnpackAndLAND    = UnpackAndOp<Type,AtomicLAND<Type>  ,BS,EQ>;
    link->da_UnpackAndLOR     = UnpackAndOp<Type,AtomicLOR<Type>   ,BS,EQ>;
    link->da_UnpackAndLXOR    = UnpackAndOp<Type,AtomicLXOR<Type>  ,BS,EQ>;
    link->da_UnpackAndBAND    = UnpackAndOp<Type,AtomicBAND<Type>  ,BS,EQ>;
    link->da_UnpackAndBOR     = UnpackAndOp<Type,AtomicBOR<Type>   ,BS,EQ>;
    link->da_UnpackAndBXOR    = UnpackAndOp<Type,AtomicBXOR<Type>  ,BS,EQ>;
    link->da_FetchAndAdd      = FetchAndOp <Type,AtomicAdd<Type>   ,BS,EQ>;

    link->da_ScatterAndInsert = ScatterAndOp<Type,AtomicInsert<Type>,BS,EQ>;
    link->da_ScatterAndAdd    = ScatterAndOp<Type,AtomicAdd<Type>   ,BS,EQ>;
    link->da_ScatterAndMult   = ScatterAndOp<Type,AtomicMult<Type>  ,BS,EQ>;
    link->da_ScatterAndMin    = ScatterAndOp<Type,AtomicMin<Type>   ,BS,EQ>;
    link->da_ScatterAndMax    = ScatterAndOp<Type,AtomicMax<Type>   ,BS,EQ>;
    link->da_ScatterAndLAND   = ScatterAndOp<Type,AtomicLAND<Type>  ,BS,EQ>;
    link->da_ScatterAndLOR    = ScatterAndOp<Type,AtomicLOR<Type>   ,BS,EQ>;
    link->da_ScatterAndLXOR   = ScatterAndOp<Type,AtomicLXOR<Type>  ,BS,EQ>;
    link->da_ScatterAndBAND   = ScatterAndOp<Type,AtomicBAND<Type>  ,BS,EQ>;
    link->da_ScatterAndBOR    = ScatterAndOp<Type,AtomicBOR<Type>   ,BS,EQ>;
    link->da_ScatterAndBXOR   = ScatterAndOp<Type,AtomicBXOR<Type>  ,BS,EQ>;
    link->da_FetchAndAddLocal = FetchAndOpLocal<Type,AtomicAdd<Type>,BS,EQ>;
  }
};

/* CUDA does not support atomics on chars. It is TBD in PETSc. */
template<typename Type,PetscInt BS,PetscInt EQ>
struct PackInit_IntegerType_Atomic<Type,BS,EQ,1> {
  static void Init(PetscSFLink link) {/* Nothing to do; leave the function pointers NULL */}
};

template<typename Type,PetscInt BS,PetscInt EQ>
static void PackInit_IntegerType(PetscSFLink link)
{
  link->d_Pack            = Pack<Type,BS,EQ>;
  link->d_UnpackAndInsert = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
  link->d_UnpackAndAdd    = UnpackAndOp<Type,Add<Type>   ,BS,EQ>;
  link->d_UnpackAndMult   = UnpackAndOp<Type,Mult<Type>  ,BS,EQ>;
  link->d_UnpackAndMin    = UnpackAndOp<Type,Min<Type>   ,BS,EQ>;
  link->d_UnpackAndMax    = UnpackAndOp<Type,Max<Type>   ,BS,EQ>;
  link->d_UnpackAndLAND   = UnpackAndOp<Type,LAND<Type>  ,BS,EQ>;
  link->d_UnpackAndLOR    = UnpackAndOp<Type,LOR<Type>   ,BS,EQ>;
  link->d_UnpackAndLXOR   = UnpackAndOp<Type,LXOR<Type>  ,BS,EQ>;
  link->d_UnpackAndBAND   = UnpackAndOp<Type,BAND<Type>  ,BS,EQ>;
  link->d_UnpackAndBOR    = UnpackAndOp<Type,BOR<Type>   ,BS,EQ>;
  link->d_UnpackAndBXOR   = UnpackAndOp<Type,BXOR<Type>  ,BS,EQ>;
  link->d_FetchAndAdd     = FetchAndOp <Type,Add<Type>   ,BS,EQ>;

  link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>;
  link->d_ScatterAndAdd    = ScatterAndOp<Type,Add<Type>   ,BS,EQ>;
  link->d_ScatterAndMult   = ScatterAndOp<Type,Mult<Type>  ,BS,EQ>;
  link->d_ScatterAndMin    = ScatterAndOp<Type,Min<Type>   ,BS,EQ>;
  link->d_ScatterAndMax    = ScatterAndOp<Type,Max<Type>   ,BS,EQ>;
  link->d_ScatterAndLAND   = ScatterAndOp<Type,LAND<Type>  ,BS,EQ>;
  link->d_ScatterAndLOR    = ScatterAndOp<Type,LOR<Type>   ,BS,EQ>;
  link->d_ScatterAndLXOR   = ScatterAndOp<Type,LXOR<Type>  ,BS,EQ>;
  link->d_ScatterAndBAND   = ScatterAndOp<Type,BAND<Type>  ,BS,EQ>;
  link->d_ScatterAndBOR    = ScatterAndOp<Type,BOR<Type>   ,BS,EQ>;
  link->d_ScatterAndBXOR   = ScatterAndOp<Type,BXOR<Type>  ,BS,EQ>;
  link->d_FetchAndAddLocal = FetchAndOpLocal<Type,Add<Type>,BS,EQ>;
  PackInit_IntegerType_Atomic<Type,BS,EQ,sizeof(Type)>::Init(link);
}

#if defined(PETSC_HAVE_COMPLEX)
template<typename Type,PetscInt BS,PetscInt EQ>
static void PackInit_ComplexType(PetscSFLink link)
{
  link->d_Pack             = Pack<Type,BS,EQ>;
  link->d_UnpackAndInsert  = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
  link->d_UnpackAndAdd     = UnpackAndOp<Type,Add<Type>   ,BS,EQ>;
  link->d_UnpackAndMult    = UnpackAndOp<Type,Mult<Type>  ,BS,EQ>;
  link->d_FetchAndAdd      = FetchAndOp <Type,Add<Type>   ,BS,EQ>;

  link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>;
  link->d_ScatterAndAdd    = ScatterAndOp<Type,Add<Type>   ,BS,EQ>;
  link->d_ScatterAndMult   = ScatterAndOp<Type,Mult<Type>  ,BS,EQ>;
  link->d_FetchAndAddLocal = FetchAndOpLocal<Type,Add<Type>,BS,EQ>;

  link->da_UnpackAndInsert = UnpackAndOp<Type,AtomicInsert<Type>,BS,EQ>;
  link->da_UnpackAndAdd    = UnpackAndOp<Type,AtomicAdd<Type>,BS,EQ>;
  link->da_UnpackAndMult   = NULL; /* Not implemented yet */
  link->da_FetchAndAdd     = NULL; /* Return value of atomicAdd on complex is not atomic */

  link->da_ScatterAndInsert = ScatterAndOp<Type,AtomicInsert<Type>,BS,EQ>;
  link->da_ScatterAndAdd    = ScatterAndOp<Type,AtomicAdd<Type>,BS,EQ>;
}
#endif

typedef signed char                      SignedChar;
typedef unsigned char                    UnsignedChar;
typedef struct {int a;      int b;     } PairInt;
typedef struct {PetscInt a; PetscInt b;} PairPetscInt;

template<typename Type>
static void PackInit_PairType(PetscSFLink link)
{
  link->d_Pack            = Pack<Type,1,1>;
  link->d_UnpackAndInsert = UnpackAndOp<Type,Insert<Type>,1,1>;
  link->d_UnpackAndMaxloc = UnpackAndOp<Type,Maxloc<Type>,1,1>;
  link->d_UnpackAndMinloc = UnpackAndOp<Type,Minloc<Type>,1,1>;

  link->d_ScatterAndInsert = ScatterAndOp<Type,Insert<Type>,1,1>;
  link->d_ScatterAndMaxloc = ScatterAndOp<Type,Maxloc<Type>,1,1>;
  link->d_ScatterAndMinloc = ScatterAndOp<Type,Minloc<Type>,1,1>;
  /* Atomics for pair types are not implemented yet */
}

template<typename Type,PetscInt BS,PetscInt EQ>
static void PackInit_DumbType(PetscSFLink link)
{
  link->d_Pack             = Pack<Type,BS,EQ>;
  link->d_UnpackAndInsert  = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
  link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>;
  /* Atomics for dumb types are not implemented yet */
}

/* Some device-specific utilities */
static PetscErrorCode PetscSFLinkSyncDevice_Cuda(PetscSFLink link)
{
  cudaError_t cerr;
  PetscFunctionBegin;
  cerr = cudaDeviceSynchronize();CHKERRCUDA(cerr);
  PetscFunctionReturn(0);
}

static PetscErrorCode PetscSFLinkSyncStream_Cuda(PetscSFLink link)
{
  cudaError_t cerr;
  PetscFunctionBegin;
  cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
  PetscFunctionReturn(0);
}

static PetscErrorCode PetscSFLinkMemcpy_Cuda(PetscSFLink link,PetscMemType dstmtype,void *dst,PetscMemType srcmtype,const void *src,size_t n)
{
  PetscFunctionBegin;
  enum cudaMemcpyKind kinds[2][2] = {{cudaMemcpyHostToHost,cudaMemcpyHostToDevice},{cudaMemcpyDeviceToHost,cudaMemcpyDeviceToDevice}};

  if (n) {
    if (dstmtype == PETSC_MEMTYPE_HOST && srcmtype == PETSC_MEMTYPE_HOST) { /* Handle HostToHost separately so that pure-CPU code won't call the CUDA runtime */
      PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);
    } else { /* Assume PETSC_MEMTYPE_HOST=0, PETSC_MEMTYPE_DEVICE=1 */
      cudaError_t err = cudaMemcpyAsync(dst,src,n,kinds[srcmtype][dstmtype],link->stream);CHKERRCUDA(err);
    }
  }
  PetscFunctionReturn(0);
}

PetscErrorCode PetscSFMalloc_Cuda(PetscMemType mtype,size_t size,void **ptr)
{
  PetscFunctionBegin;
  if (mtype == PETSC_MEMTYPE_HOST) {PetscErrorCode ierr = PetscMalloc(size,ptr);CHKERRQ(ierr);}
  else if (mtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaMalloc(ptr,size);CHKERRCUDA(err);}
  else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %d",(int)mtype);
  PetscFunctionReturn(0);
}

PetscErrorCode PetscSFFree_Cuda(PetscMemType mtype,void *ptr)
{
  PetscFunctionBegin;
  if (mtype == PETSC_MEMTYPE_HOST) {PetscErrorCode ierr = PetscFree(ptr);CHKERRQ(ierr);}
  else if (mtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaFree(ptr);CHKERRCUDA(err);}
  else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %d",(int)mtype);
  PetscFunctionReturn(0);
}

/*====================================================================================*/
/*                Main driver to init MPI datatype on device                          */
/*====================================================================================*/

/* Some fields of link are initialized by PetscSFPackSetUp_Host. This routine only does what is needed on the device */
PetscErrorCode PetscSFLinkSetUp_Cuda(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
{
  PetscErrorCode ierr;
  cudaError_t    err;
  PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
  PetscBool      is2Int,is2PetscInt;
#if defined(PETSC_HAVE_COMPLEX)
  PetscInt       nPetscComplex=0;
#endif

  PetscFunctionBegin;
  if (link->deviceinited) PetscFunctionReturn(0);
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
  /* MPI_CHAR is treated below as a dumb type that does not support reduction according to the MPI standard */
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
#if defined(PETSC_HAVE_COMPLEX)
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
#endif
  ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);

  if (is2Int) {
    PackInit_PairType<PairInt>(link);
  } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
    PackInit_PairType<PairPetscInt>(link);
  } else if (nPetscReal) {
    if      (nPetscReal == 8) PackInit_RealType<PetscReal,8,1>(link); else if (nPetscReal%8 == 0) PackInit_RealType<PetscReal,8,0>(link);
    else if (nPetscReal == 4) PackInit_RealType<PetscReal,4,1>(link); else if (nPetscReal%4 == 0) PackInit_RealType<PetscReal,4,0>(link);
    else if (nPetscReal == 2) PackInit_RealType<PetscReal,2,1>(link); else if (nPetscReal%2 == 0) PackInit_RealType<PetscReal,2,0>(link);
    else if (nPetscReal == 1) PackInit_RealType<PetscReal,1,1>(link); else if (nPetscReal%1 == 0) PackInit_RealType<PetscReal,1,0>(link);
  } else if (nPetscInt) {
    if      (nPetscInt == 8) PackInit_IntegerType<PetscInt,8,1>(link); else if (nPetscInt%8 == 0) PackInit_IntegerType<PetscInt,8,0>(link);
    else if (nPetscInt == 4) PackInit_IntegerType<PetscInt,4,1>(link); else if (nPetscInt%4 == 0) PackInit_IntegerType<PetscInt,4,0>(link);
    else if (nPetscInt == 2) PackInit_IntegerType<PetscInt,2,1>(link); else if (nPetscInt%2 == 0) PackInit_IntegerType<PetscInt,2,0>(link);
    else if (nPetscInt == 1) PackInit_IntegerType<PetscInt,1,1>(link); else if (nPetscInt%1 == 0) PackInit_IntegerType<PetscInt,1,0>(link);
#if defined(PETSC_USE_64BIT_INDICES)
  } else if (nInt) {
    if      (nInt == 8) PackInit_IntegerType<int,8,1>(link); else if (nInt%8 == 0) PackInit_IntegerType<int,8,0>(link);
    else if (nInt == 4) PackInit_IntegerType<int,4,1>(link); else if (nInt%4 == 0) PackInit_IntegerType<int,4,0>(link);
    else if (nInt == 2) PackInit_IntegerType<int,2,1>(link); else if (nInt%2 == 0) PackInit_IntegerType<int,2,0>(link);
    else if (nInt == 1) PackInit_IntegerType<int,1,1>(link); else if (nInt%1 == 0) PackInit_IntegerType<int,1,0>(link);
#endif
  } else if (nSignedChar) {
    if      (nSignedChar == 8) PackInit_IntegerType<SignedChar,8,1>(link); else if (nSignedChar%8 == 0) PackInit_IntegerType<SignedChar,8,0>(link);
    else if (nSignedChar == 4) PackInit_IntegerType<SignedChar,4,1>(link); else if (nSignedChar%4 == 0) PackInit_IntegerType<SignedChar,4,0>(link);
    else if (nSignedChar == 2) PackInit_IntegerType<SignedChar,2,1>(link); else if (nSignedChar%2 == 0) PackInit_IntegerType<SignedChar,2,0>(link);
    else if (nSignedChar == 1) PackInit_IntegerType<SignedChar,1,1>(link); else if (nSignedChar%1 == 0) PackInit_IntegerType<SignedChar,1,0>(link);
  } else if (nUnsignedChar) {
    if      (nUnsignedChar == 8) PackInit_IntegerType<UnsignedChar,8,1>(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType<UnsignedChar,8,0>(link);
    else if (nUnsignedChar == 4) PackInit_IntegerType<UnsignedChar,4,1>(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType<UnsignedChar,4,0>(link);
    else if (nUnsignedChar == 2) PackInit_IntegerType<UnsignedChar,2,1>(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType<UnsignedChar,2,0>(link);
    else if (nUnsignedChar == 1) PackInit_IntegerType<UnsignedChar,1,1>(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType<UnsignedChar,1,0>(link);
#if defined(PETSC_HAVE_COMPLEX)
  } else if (nPetscComplex) {
    if      (nPetscComplex == 8) PackInit_ComplexType<PetscComplex,8,1>(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType<PetscComplex,8,0>(link);
    else if (nPetscComplex == 4) PackInit_ComplexType<PetscComplex,4,1>(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType<PetscComplex,4,0>(link);
    else if (nPetscComplex == 2) PackInit_ComplexType<PetscComplex,2,1>(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType<PetscComplex,2,0>(link);
    else if (nPetscComplex == 1) PackInit_ComplexType<PetscComplex,1,1>(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType<PetscComplex,1,0>(link);
#endif
  } else {
    MPI_Aint lb,nbyte;
    ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr);
    if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
    if (nbyte % sizeof(int)) { /* If the type size is not a multiple of sizeof(int) */
      if      (nbyte == 4) PackInit_DumbType<char,4,1>(link); else if (nbyte%4 == 0) PackInit_DumbType<char,4,0>(link);
      else if (nbyte == 2) PackInit_DumbType<char,2,1>(link); else if (nbyte%2 == 0) PackInit_DumbType<char,2,0>(link);
      else if (nbyte == 1) PackInit_DumbType<char,1,1>(link); else if (nbyte%1 == 0) PackInit_DumbType<char,1,0>(link);
    } else {
      nInt = nbyte / sizeof(int);
      if      (nInt == 8) PackInit_DumbType<int,8,1>(link); else if (nInt%8 == 0) PackInit_DumbType<int,8,0>(link);
      else if (nInt == 4) PackInit_DumbType<int,4,1>(link); else if (nInt%4 == 0) PackInit_DumbType<int,4,0>(link);
      else if (nInt == 2) PackInit_DumbType<int,2,1>(link); else if (nInt%2 == 0) PackInit_DumbType<int,2,0>(link);
      else if (nInt == 1) PackInit_DumbType<int,1,1>(link); else if (nInt%1 == 0) PackInit_DumbType<int,1,0>(link);
    }
  }

  if (!sf->use_default_stream) {err = cudaStreamCreate(&link->stream);CHKERRCUDA(err);}
  if (!sf->maxResidentThreadsPerGPU) { /* Not initialized */
    int                   device;
    struct cudaDeviceProp props;
    err = cudaGetDevice(&device);CHKERRCUDA(err);
    err = cudaGetDeviceProperties(&props,device);CHKERRCUDA(err);
    sf->maxResidentThreadsPerGPU = props.maxThreadsPerMultiProcessor*props.multiProcessorCount;
  }
  link->maxResidentThreadsPerGPU = sf->maxResidentThreadsPerGPU;

  link->d_SyncDevice = PetscSFLinkSyncDevice_Cuda;
  link->d_SyncStream = PetscSFLinkSyncStream_Cuda;
  link->Memcpy       = PetscSFLinkMemcpy_Cuda;
  link->deviceinited = PETSC_TRUE;
  PetscFunctionReturn(0);
}