/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

#ifndef CRM_CUDA_UTILS
#define CRM_CUDA_UTILS

// split n threads into a 2D grid of blocks: the returned int3 holds the grid dimensions in .x and .y
// and the number of threads per block in .z
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))

inline int3 getgrid(int n, int shared_per_thread = 0, int threadsmax = 256, bool p2 = false)
{
  int3 gridparams;
  int sharedsize = 16000;   // assumed available shared memory per block (bytes)

  // if shared memory is requested per thread, cap the block size so one block fits into shared memory
  if(shared_per_thread > 0) threadsmax = sharedsize / shared_per_thread < threadsmax ? sharedsize / shared_per_thread : threadsmax;

  // pick a block size (stored in .z) based on the problem size and the allowed maximum
  if((n < 60 * 32) || (threadsmax < 64))
    gridparams.z = 32;
  else if((n < 60 * 64) || (threadsmax < 128))
    gridparams.z = 64;
  else if((n < 60 * 128) || (threadsmax < 256))
    gridparams.z = 128;
  else if((n < 60 * 256) || (threadsmax < 512))
    gridparams.z = 256;
  else gridparams.z = 512;

  // if a power-of-two block size is required, use the largest power of two (>= 16) not exceeding threadsmax
  if(p2) {
    gridparams.z = 16;

    while(gridparams.z * 2 <= threadsmax) gridparams.z *= 2;
  }

  // distribute the blocks over a 1D grid or, for large block counts, a roughly square 2D grid
  int blocks = (n + gridparams.z - 1) / gridparams.z;

  if(blocks > 10000)
    gridparams.x = gridparams.y = int(sqrt(blocks));
  else {
    gridparams.x = blocks;
    gridparams.y = 1;
  }

  // make sure the grid covers all n elements
  while(gridparams.x * gridparams.y * gridparams.z < n) gridparams.x++;

  if(gridparams.x == 0) gridparams.x = 1;

  return gridparams;
}
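
// Usage sketch (illustrative only; kernel and variable names are hypothetical): the returned int3
// packs the grid dimensions in .x/.y and the block size in .z.
//
//   int3 layout = getgrid(nall);
//   dim3 grid(layout.x, layout.y, 1);
//   dim3 threads(layout.z, 1, 1);
//   some_kernel<<<grid, threads>>>(...);      // each thread must check its global index against nall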

// return value: 1 if f < 0; else 0
// take care when passing expressions such as "blockIdx.x - n" for f: they may be evaluated as an unsigned int
static inline __device__ int negativCUDA(float f)
{
  return ((unsigned int)1 << 31 & (__float_as_int(f))) >> 31;
}

// return value: -1 if f < 0; else +1
static inline __device__ float fsignCUDA(float f)
{
  return f < 0.0f ? -1.0f : 1.0f;
}

// functions to copy data between global and shared memory (in fact between any two device memory
// regions, provided the required read and write access is available)
// blockDim.y and blockDim.z are assumed to be 1
static inline __device__ void copySharedToGlob(int* shared, int* glob, const int &n)
{
  int i, k;
  k = n - blockDim.x;

  for(i = 0; i < k; i += blockDim.x) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  __syncthreads();
}

static inline __device__ void copySharedToGlob(float* shared, float* glob, const int &n)
{
  int i, k;
  k = n - blockDim.x;

  for(i = 0; i < k; i += blockDim.x) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  __syncthreads();
}

static inline __device__ void copySharedToGlob(double* shared, double* glob, const int &n)
{
  int i, k;
  k = n - blockDim.x;

  for(i = 0; i < k; i += blockDim.x) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  __syncthreads();
}

static inline __device__ void copyGlobToShared(int* glob, int* shared, const int &n)
{
  int i, k;
  k = n - blockDim.x;

  for(i = 0; i < k; i += blockDim.x) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  __syncthreads();
}

static __device__ inline void copyGlobToShared(float* glob, float* shared, const int &n)
{
  int i, k;
  k = n - blockDim.x;

  for(i = 0; i < k; i += blockDim.x) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  __syncthreads();
}

static __device__ inline void copyGlobToShared(double* glob, double* shared, const int &n)
{
  int i, k;
  k = n - (int)blockDim.x;   // signed, so the loop is skipped when n < blockDim.x

  for(i = 0; i < k; i += blockDim.x) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  __syncthreads();
}
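
// Usage sketch (illustrative only; buffer names are hypothetical): stage one block's worth of global
// data into shared memory, work on it, and write it back. Assumes blockDim.y == blockDim.z == 1 and
// that the kernel was launched with at least n * sizeof(float) bytes of dynamic shared memory.
//
//   extern __shared__ float buf[];
//   copyGlobToShared(glob_ptr, buf, n);       // includes a trailing __syncthreads()
//   ...                                       // operate on buf[]
//   copySharedToGlob(buf, glob_ptr, n);       // includes a trailing __syncthreads()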

// copy data between two memory areas on the device; 3D block dimensions are allowed
static __device__ inline void copyData(double* source, double* target, const int &n)
{
  int i;
  int stride = blockDim.x * blockDim.y * blockDim.z;   // signed, so the loop is skipped when n < stride
  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;

  for(i = 0; i < n - stride; i += stride) {
    target[i + offset] = source[i + offset];
  }

  if(offset < n - i) {
    target[i + offset] = source[i + offset];
  }

  __syncthreads();
}

static __device__ inline void copyData(float* source, float* target, const int &n)
{
  int i;
  int stride = blockDim.x * blockDim.y * blockDim.z;
  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;

  for(i = 0; i < n - stride; i += stride) {
    target[i + offset] = source[i + offset];
  }

  if(offset < n - i) {
    target[i + offset] = source[i + offset];
  }

  __syncthreads();
}

static __device__ inline void copyData(int* source, int* target, const int &n)
{
  int i;
  int stride = blockDim.x * blockDim.y * blockDim.z;
  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;

  for(i = 0; i < n - stride; i += stride) {
    target[i + offset] = source[i + offset];
  }

  if(offset < n - i) {
    target[i + offset] = source[i + offset];
  }

  __syncthreads();
}

static __device__ inline void copyData(unsigned int* source, unsigned int* target, const int &n)
{
  int i;
  int stride = blockDim.x * blockDim.y * blockDim.z;
  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;

  for(i = 0; i < n - stride; i += stride) {
    target[i + offset] = source[i + offset];
  }

  if(offset < n - i) {
    target[i + offset] = source[i + offset];
  }

  __syncthreads();
}

// functions to sum the values of one block; the P2 variants require blockDim.x to be a power of two,
// otherwise the behaviour is undefined
// on return, data[0] = sum over data[i] for i = 0 .. blockDim.x-1
// reduceBlockP2 and reduceBlock assume blockDim.y == 1 and blockDim.z == 1
static __device__ inline void reduceBlockP2(int* data)
{
  __syncthreads();

  for(int i = 2; i <= blockDim.x; i *= 2) {
    if(threadIdx.x < blockDim.x / i)
      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

    __syncthreads();
  }
}

static __device__ inline void reduceBlockP2(unsigned int* data)
{
  __syncthreads();

  for(int i = 2; i <= blockDim.x; i *= 2) {
    if(threadIdx.x < blockDim.x / i)
      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

    __syncthreads();
  }
}

static __device__ inline void reduceBlockP2(float* data)
{
  __syncthreads();

  for(int i = 2; i <= blockDim.x; i *= 2) {
    if(threadIdx.x < blockDim.x / i)
      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

    __syncthreads();
  }
}

static __device__ inline void reduceBlockP2(double* data)
{
  __syncthreads();

  for(int i = 2; i <= blockDim.x; i *= 2) {
    if(threadIdx.x < blockDim.x / i)
      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

    __syncthreads();
  }
}

static __device__ inline void reduceBlock(float* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] += data[threadIdx.x + p2];

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] += data[threadIdx.x + p2 / i];

    __syncthreads();
  }
}

static __device__ inline void reduceBlock(int* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] += data[threadIdx.x + p2];

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] += data[threadIdx.x + p2 / i];

    __syncthreads();
  }
}

static __device__ inline void reduceBlock(unsigned int* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] += data[threadIdx.x + p2];

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] += data[threadIdx.x + p2 / i];

    __syncthreads();
  }
}

static __device__ inline void reduceBlock(double* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] += data[threadIdx.x + p2];

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] += data[threadIdx.x + p2 / i];

    __syncthreads();
  }
}
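
// Usage sketch (illustrative only; names are hypothetical): per-block sum of one value per thread.
// Each thread writes its contribution to shared memory; after reduceBlock() the total is in element 0.
//
//   extern __shared__ float sums[];           // one slot per thread, blockDim.y == blockDim.z == 1
//   sums[threadIdx.x] = my_value;
//   reduceBlock(sums);                        // use reduceBlockP2() if blockDim.x is a power of two
//   if(threadIdx.x == 0) block_sum[blockIdx.x] = sums[0];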

static __device__ inline void cudaFillBlockData_int(int* data, const int &n, const int &value)
{
  int i, k;
  k = n - (int)blockDim.x;   // signed, so the loop is skipped when n < blockDim.x

  for(i = 0; i < k; i += blockDim.x) {
    data[i + threadIdx.x] = value;
  }

  if(threadIdx.x < n - i) data[i + threadIdx.x] = value;
}

static __device__ inline void cudaFillBlockData_float(float* data, const int &n, const float &value)
{
  int i, k;
  k = n - (int)blockDim.x;

  for(i = 0; i < k; i += blockDim.x) {
    data[i + threadIdx.x] = value;
  }

  if(threadIdx.x < n - i) data[i + threadIdx.x] = value;
}

// caution: not thoroughly tested
// reduce n values in data[] (n may exceed blockDim.x); the result ends up in data[0]
static __device__ inline void reduce(float* data, int n)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  // fold the elements beyond the largest power of two (p2) onto the front part
  int j = 0;

  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2];
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    j = 0;

    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2 / i];
      j++;
    }

    __syncthreads();
  }
}

// caution: not thoroughly tested
static __device__ inline void reduce(double* data, int n)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2];
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    j = 0;

    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2 / i];
      j++;
    }

    __syncthreads();
  }
}

static __device__ inline void minOfBlock(float* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] = MIN(data[threadIdx.x + p2], data[threadIdx.x]);

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] = MIN(data[threadIdx.x + p2 / i], data[threadIdx.x]);

    __syncthreads();
  }
}

static __device__ inline void maxOfBlock(float* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] = MAX(data[threadIdx.x + p2], data[threadIdx.x]);

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] = MAX(data[threadIdx.x + p2 / i], data[threadIdx.x]);

    __syncthreads();
  }
}

static __device__ inline void minOfBlock(double* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] = MIN(data[threadIdx.x + p2], data[threadIdx.x]);

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] = MIN(data[threadIdx.x + p2 / i], data[threadIdx.x]);

    __syncthreads();
  }
}

static __device__ inline void maxOfBlock(double* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] = MAX(data[threadIdx.x + p2], data[threadIdx.x]);

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] = MAX(data[threadIdx.x + p2 / i], data[threadIdx.x]);

    __syncthreads();
  }
}
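
// Usage sketch (illustrative only; names are hypothetical): per-block maximum, e.g. as the first
// stage of a global maximum over all blocks.
//
//   extern __shared__ float vals[];           // one slot per thread
//   vals[threadIdx.x] = my_value;
//   maxOfBlock(vals);                         // maximum of the block ends up in vals[0]
//   if(threadIdx.x == 0) block_max[blockIdx.x] = vals[0];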


// caution: not thoroughly tested
static __device__ inline void minOfData(double* data, int n)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    j = 0;

    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
      j++;
    }

    __syncthreads();
  }
}

// caution: not thoroughly tested
static __device__ inline void maxOfData(double* data, int n)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    j = 0;

    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
      j++;
    }

    __syncthreads();
  }
}

// caution: not thoroughly tested
static __device__ inline void minOfData(float* data, int n)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    j = 0;

    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
      j++;
    }

    __syncthreads();
  }
}

// caution: not thoroughly tested
static __device__ inline void maxOfData(float* data, int n)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    j = 0;

    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
      j++;
    }

    __syncthreads();
  }
}

#if X_PRECISION == 2
static __device__ inline double tex1Dfetch_double(texture<int2, 1> t, int i)
{
  int2 v = tex1Dfetch(t, i);
  return __hiloint2double(v.y, v.x);
}

static __device__ inline X_FLOAT4 tex1Dfetch_double(texture<int4, 1> t, int i)
{
  int4 v = tex1Dfetch(t, 2 * i);
  int4 u = tex1Dfetch(t, 2 * i + 1);
  X_FLOAT4 w;

  w.x = __hiloint2double(v.y, v.x);
  w.y = __hiloint2double(v.w, v.z);
  w.z = __hiloint2double(u.y, u.x);
  w.w = __hiloint2double(u.w, u.z);
  return w;
}
#endif

inline void BindXTypeTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  _x_type_tex.normalized = false;                      // do not use normalized texture coordinates
  _x_type_tex.filterMode = cudaFilterModePoint;        // point sampling, no filtering
  _x_type_tex.addressMode[0] = cudaAddressModeWrap;    // wrap texture coordinates
  const textureReference* x_type_texture_ptr = &MY_AP(x_type_tex);

#if X_PRECISION == 1
  cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float4>();
  cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(X_FLOAT4));
#else
  cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int4>();
  cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}

static __device__ inline X_FLOAT4 fetchXType(int i)
{
#ifdef CUDA_USE_TEXTURE
#if X_PRECISION == 1
  return tex1Dfetch(_x_type_tex, i);
#else
  return tex1Dfetch_double(_x_type_tex, i);
#endif
#else
  return _x_type[i];
#endif
}
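
// Usage sketch (illustrative only): read a packed position + type record inside a kernel.
// fetchXType() goes through the texture path when CUDA_USE_TEXTURE is defined and through the
// plain _x_type array otherwise; the atom type is stored in the .w component.
//
//   X_FLOAT4 xi = fetchXType(i);
//   int itype = static_cast<int>(xi.w);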

#if V_PRECISION == 2
static __device__ inline double tex1Dfetch_double_v(texture<int2, 1> t, int i)
{
  int2 v = tex1Dfetch(t, i);
  return __hiloint2double(v.y, v.x);
}

static __device__ inline V_FLOAT4 tex1Dfetch_double_v(texture<int4, 1> t, int i)
{
  int4 v = tex1Dfetch(t, 2 * i);
  int4 u = tex1Dfetch(t, 2 * i + 1);
  V_FLOAT4 w;

  w.x = __hiloint2double(v.y, v.x);
  w.y = __hiloint2double(v.w, v.z);
  w.z = __hiloint2double(u.y, u.x);
  w.w = __hiloint2double(u.w, u.z);
  return w;
}
#endif

inline void BindVRadiusTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  _v_radius_tex.normalized = false;                      // do not use normalized texture coordinates
  _v_radius_tex.filterMode = cudaFilterModePoint;        // point sampling, no filtering
  _v_radius_tex.addressMode[0] = cudaAddressModeWrap;    // wrap texture coordinates
  const textureReference* v_radius_texture_ptr = &MY_AP(v_radius_tex);

#if V_PRECISION == 1
  cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc<float4>();
  cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * sizeof(V_FLOAT4));
#else
  cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc<int4>();
  cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}

static __device__ inline V_FLOAT4 fetchVRadius(int i)
{
#ifdef CUDA_USE_TEXTURE
#if V_PRECISION == 1
  return tex1Dfetch(_v_radius_tex, i);
#else
  return tex1Dfetch_double_v(_v_radius_tex, i);
#endif
#else
  return _v_radius[i];
#endif
}

inline void BindOmegaRmassTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  _omega_rmass_tex.normalized = false;                      // do not use normalized texture coordinates
  _omega_rmass_tex.filterMode = cudaFilterModePoint;        // point sampling, no filtering
  _omega_rmass_tex.addressMode[0] = cudaAddressModeWrap;    // wrap texture coordinates
  const textureReference* omega_rmass_texture_ptr = &MY_AP(omega_rmass_tex);

#if V_PRECISION == 1
  cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc<float4>();
  cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * sizeof(V_FLOAT4));
#else
  cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc<int4>();
  cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}

static __device__ inline V_FLOAT4 fetchOmegaRmass(int i)
{
#ifdef CUDA_USE_TEXTURE
#if V_PRECISION == 1
  return tex1Dfetch(_omega_rmass_tex, i);
#else
  return tex1Dfetch_double_v(_omega_rmass_tex, i);
#endif
#else
  return _omega_rmass[i];
#endif
}

#if F_PRECISION == 2
static __device__ inline double tex1Dfetch_double_f(texture<int2, 1> t, int i)
{
  int2 v = tex1Dfetch(t, i);
  return __hiloint2double(v.y, v.x);
}

static __device__ inline F_FLOAT4 tex1Dfetch_double_f(texture<int4, 1> t, int i)
{
  int4 v = tex1Dfetch(t, 2 * i);
  int4 u = tex1Dfetch(t, 2 * i + 1);
  F_FLOAT4 w;

  w.x = __hiloint2double(v.y, v.x);
  w.y = __hiloint2double(v.w, v.z);
  w.z = __hiloint2double(u.y, u.x);
  w.w = __hiloint2double(u.w, u.z);
  return w;
}
#endif

inline void BindQTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  _q_tex.normalized = false;                      // do not use normalized texture coordinates
  _q_tex.filterMode = cudaFilterModePoint;        // point sampling, no filtering
  _q_tex.addressMode[0] = cudaAddressModeWrap;    // wrap texture coordinates
  const textureReference* q_texture_ptr = &MY_AP(q_tex);

#if F_PRECISION == 1
  cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc<float>();
  cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(F_FLOAT));
#else
  cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc<int2>();
  cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(int2));
#endif
#endif
}

static __device__ inline F_FLOAT fetchQ(int i)
{
#ifdef CUDA_USE_TEXTURE
#if F_PRECISION == 1
  return tex1Dfetch(_q_tex, i);
#else
  return tex1Dfetch_double_f(_q_tex, i);
#endif
#else
  return _q[i];
#endif
}

/*

inline void BindPairCoeffTypeTexture(cuda_shared_data* sdata,coeff_tex)
{
	#ifdef CUDA_USE_TEXTURE
		_coeff_tex.normalized = false;                      // access with normalized texture coordinates
		_coeff_tex.filterMode = cudaFilterModePoint;        // Point mode, so no
		_coeff_tex.addressMode[0] = cudaAddressModeWrap;    // wrap texture coordinates
		const textureReference* coeff_texture_ptr;
		cudaGetTextureReference(&coeff_texture_ptr, &MY_AP(coeff_tex));

		#if F_PRECISION == 1
		cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float4>();
		cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(X_FLOAT4));
		#else
		cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int4>();
		cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int4));
		#endif
	#endif
}

static __device__ inline X_FLOAT4 fetchXType(int i)
{
		#ifdef CUDA_USE_TEXTURE
		  #if X_PRECISION == 1
		     return tex1Dfetch(_x_type_tex,i);
		  #else
		     return tex1Dfetch_double(_x_type_tex,i);
		  #endif
		#else
		  return _x_type[i];
		#endif
}
*/
#define SBBITS 30

static inline __device__ int sbmask(int j)
{
  return j >> SBBITS & 3;
}
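
// Usage sketch (illustrative only): neighbor indices in LAMMPS carry the special-bond category in
// their top bits; sbmask() extracts it, and masking with (1 << SBBITS) - 1 recovers the plain index.
//
//   int j = jlist[jj];                        // "jlist"/"jj" are hypothetical names
//   int which = sbmask(j);                    // 0 = normal, 1/2/3 = 1-2 / 1-3 / 1-4 special neighbor
//   j &= (1 << SBBITS) - 1;                   // strip the flag bits to get the real neighbor index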

static inline __device__ void minimum_image(X_FLOAT4 &delta)
{
  if(_triclinic == 0) {
    if(_periodicity[0]) {
      delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] :
                 (delta.x >  X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0));
    }

    if(_periodicity[1]) {
      delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] :
                 (delta.y >  X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0));
    }

    if(_periodicity[2]) {
      delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] :
                 (delta.z >  X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0));
    }

  } else {
    if(_periodicity[2]) {
      // apply the tilt corrections to x and y before delta.z itself is wrapped
      delta.y += delta.z < -X_F(0.5) * _prd[2] ? _h[3] :
                 (delta.z >  X_F(0.5) * _prd[2] ? -_h[3] : X_F(0.0));
      delta.x += delta.z < -X_F(0.5) * _prd[2] ? _h[4] :
                 (delta.z >  X_F(0.5) * _prd[2] ? -_h[4] : X_F(0.0));
      delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] :
                 (delta.z >  X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0));
    }

    if(_periodicity[1]) {
      delta.x += delta.y < -X_F(0.5) * _prd[1] ? _h[5] :
                 (delta.y >  X_F(0.5) * _prd[1] ? -_h[5] : X_F(0.0));
      delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] :
                 (delta.y >  X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0));
    }

    if(_periodicity[0]) {
      delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] :
                 (delta.x >  X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0));
    }
  }
}

static inline __device__ void closest_image(X_FLOAT4 &x1, X_FLOAT4 &x2, X_FLOAT4 &ci)
{
  ci.x = x2.x - x1.x;
  ci.y = x2.y - x1.y;
  ci.z = x2.z - x1.z;
  minimum_image(ci);
  ci.x += x1.x;
  ci.y += x1.y;
  ci.z += x1.z;
}
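
// Usage sketch (illustrative only; variable names are hypothetical): minimum-image separation
// between two positions fetched with fetchXType().
//
//   X_FLOAT4 xi = fetchXType(i);
//   X_FLOAT4 xj = fetchXType(j);
//   X_FLOAT4 d;
//   d.x = xj.x - xi.x;  d.y = xj.y - xi.y;  d.z = xj.z - xi.z;
//   minimum_image(d);                         // wrap the difference vector into the primary cell
//   X_FLOAT rsq = d.x * d.x + d.y * d.y + d.z * d.z;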

#endif // CRM_CUDA_UTILS