/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

#ifndef CRM_CUDA_UTILS
#define CRM_CUDA_UTILS

// split n threads into a 2-dimensional grid of blocks; the returned int3 holds grid.x, grid.y and threads.x (in its .x, .y and .z fields)
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))

inline int3 getgrid(int n, int shared_per_thread = 0, int threadsmax = 256, bool p2 = false)
{
  int3 gridparams;
  int sharedsize = 16000;

  if(shared_per_thread > 0) threadsmax = sharedsize / shared_per_thread < threadsmax ? sharedsize / shared_per_thread : threadsmax;

  if((n < 60 * 32) || (threadsmax < 64))
    gridparams.z = 32;
  else if((n < 60 * 64) || (threadsmax < 128))
    gridparams.z = 64;
  else if((n < 60 * 128) || (threadsmax < 256))
    gridparams.z = 128;
  else if((n < 60 * 256) || (threadsmax < 512))
    gridparams.z = 256;
  else gridparams.z = 512;

  if(p2) {
    gridparams.z = 16;

    while(gridparams.z * 2 <= threadsmax) gridparams.z *= 2;
  }

  int blocks = (n + gridparams.z - 1) / gridparams.z;

  if(blocks > 10000)
    gridparams.x = gridparams.y = int(sqrt(blocks));
  else {
    gridparams.x = blocks;
    gridparams.y = 1;
  }

  while(gridparams.x * gridparams.y * gridparams.z < n) gridparams.x++;

  if(gridparams.x == 0) gridparams.x = 1;

  return gridparams;
}
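
/* Usage sketch (illustrative only): how the int3 returned by getgrid() is
   typically turned into launch parameters. The kernel name my_kernel, its
   arguments and the element count nall are placeholders, not part of this
   header.

     int3 layout = getgrid(nall, 0, 256, true);   // true: force a power-of-2 block size
     dim3 grid(layout.x, layout.y, 1);
     dim3 threads(layout.z, 1, 1);
     my_kernel<<<grid, threads>>>(dev_ptr, nall);
*/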

// return value: 1 if f < 0; else 0
// take care when passing values such as "blockIdx.x - n" for f: the subtraction may be evaluated as an unsigned int
static inline __device__ int negativCUDA(float f)
{
  return ((unsigned int)1 << 31 & (__float_as_int(f))) >> 31;
}
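
/* Illustration of the caveat above (not a definition from this header):
   blockIdx.x is unsigned, so "blockIdx.x - n" wraps to a huge positive value
   instead of becoming negative; cast to a signed int before converting to
   float.

     int is_lower = negativCUDA((float)((int)blockIdx.x - n));  // 1 if blockIdx.x < n, else 0
*/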

// return value: -1 if f < 0; else +1
static inline __device__ float fsignCUDA(float f)
{
  return f < 0.0f ? -1.0f : 1.0f;
}

// functions to copy data between global and shared memory (in fact they copy between any two memory regions on the device, as long as the source is readable and the destination writable)
// blockDim.y and blockDim.z are assumed to be 1
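
/* Usage sketch (illustrative only): staging a global array through shared
   memory inside a kernel. The kernel name, buffer names and arguments are
   placeholders; the kernel must be launched with a 1D block and at least
   n * sizeof(int) bytes of dynamic shared memory.

     extern __shared__ int sharedbuf[];

     __global__ void stage_kernel(int* glob_in, int* glob_out, int n)
     {
       copyGlobToShared(glob_in, sharedbuf, n);   // all threads of the block cooperate
       // ... operate on sharedbuf ...
       copySharedToGlob(sharedbuf, glob_out, n);  // ends with a __syncthreads()
     }
*/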
static inline __device__ void copySharedToGlob(int* shared, int* glob, const int &n)
{
  int i, k;
  k = n - blockDim.x;

  for(i = 0; i < k; i += blockDim.x) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  __syncthreads();
}

static inline __device__ void copySharedToGlob(float* shared, float* glob, const int &n)
{
  int i, k;
  k = n - blockDim.x;

  for(i = 0; i < k; i += blockDim.x) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  __syncthreads();
}

static inline __device__ void copySharedToGlob(double* shared, double* glob, const int &n)
{
  int i, k;
  k = n - blockDim.x;

  for(i = 0; i < k; i += blockDim.x) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  __syncthreads();
}

static inline __device__ void copyGlobToShared(int* glob, int* shared, const int &n)
{
  int i, k;
  k = n - blockDim.x;

  for(i = 0; i < k; i += blockDim.x) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  __syncthreads();
}

static __device__ inline void copyGlobToShared(float* glob, float* shared, const int &n)
{
  int i, k;
  k = n - blockDim.x;

  for(i = 0; i < k; i += blockDim.x) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  __syncthreads();
}

static __device__ inline void copyGlobToShared(double* glob, double* shared, const int &n)
{
  int i;

  for(i = 0; i < n - blockDim.x; i += blockDim.x) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  __syncthreads();
}

// copy data between two memory areas on the device; 3D block dimensions are allowed
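
/* Usage sketch (illustrative only): copyData() derives its per-thread offset
   from all three block dimensions, so it can be called from kernels launched
   with a 3D block, e.g. dim3 threads(8, 8, 4). buf_old and buf_new are
   placeholder device pointers.

     copyData(buf_old, buf_new, n);   // copies n elements, then __syncthreads()
*/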
static __device__ inline void copyData(double* source, double* target, const int &n)
{
  int i;
  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;

  for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
    target[i + offset] = source[i + offset];
  }

  if(offset < n - i) {
    target[i + offset] = source[i + offset];
  }

  __syncthreads();
}

static __device__ inline void copyData(float* source, float* target, const int &n)
{
  int i;
  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;

  for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
    target[i + offset] = source[i + offset];
  }

  if(offset < n - i) {
    target[i + offset] = source[i + offset];
  }

  __syncthreads();
}

static __device__ inline void copyData(int* source, int* target, const int &n)
{
  int i;
  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;

  for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
    target[i + offset] = source[i + offset];
  }

  if(offset < n - i) {
    target[i + offset] = source[i + offset];
  }

  __syncthreads();
}

static __device__ inline void copyData(unsigned int* source, unsigned int* target, const int &n)
{
  int i;
  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;

  for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
    target[i + offset] = source[i + offset];
  }

  if(offset < n - i) {
    target[i + offset] = source[i + offset];
  }

  __syncthreads();
}

// functions to sum the values of one block; P2 means blockDim.x MUST be a power of 2, otherwise the behaviour is not well defined
// on return, data[0] = sum_{i=0}^{blockDim.x-1} data[i]
// for reduceBlockP2 and reduceBlock, blockDim.y and blockDim.z must be 1
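
/* Usage sketch (illustrative only): a per-block sum through shared memory.
   The kernel name and buffers are placeholders; the kernel must be launched
   with a 1D block and blockDim.x * sizeof(float) bytes of dynamic shared
   memory. For reduceBlockP2() blockDim.x must additionally be a power of 2.

     extern __shared__ float sums[];

     __global__ void block_sum_kernel(float* in, float* block_out, int n)
     {
       int i = blockIdx.x * blockDim.x + threadIdx.x;
       sums[threadIdx.x] = (i < n) ? in[i] : 0.0f;
       reduceBlock(sums);                         // works for any blockDim.x
       if(threadIdx.x == 0) block_out[blockIdx.x] = sums[0];
     }
*/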
static __device__ inline void reduceBlockP2(int* data)
{
  __syncthreads();

  for(int i = 2; i <= blockDim.x; i *= 2) {
    if(threadIdx.x < blockDim.x / i)
      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

    __syncthreads();
  }
}

static __device__ inline void reduceBlockP2(unsigned int* data)
{
  __syncthreads();

  for(int i = 2; i <= blockDim.x; i *= 2) {
    if(threadIdx.x < blockDim.x / i)
      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

    __syncthreads();
  }
}

static __device__ inline void reduceBlockP2(float* data)
{
  __syncthreads();

  for(int i = 2; i <= blockDim.x; i *= 2) {
    if(threadIdx.x < blockDim.x / i)
      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

    __syncthreads();
  }
}

static __device__ inline void reduceBlockP2(double* data)
{
  __syncthreads();

  for(int i = 2; i <= blockDim.x; i *= 2) {
    if(threadIdx.x < blockDim.x / i)
      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

    __syncthreads();
  }
}

static __device__ inline void reduceBlock(float* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] += data[threadIdx.x + p2];

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] += data[threadIdx.x + p2 / i];

    __syncthreads();
  }
}

static __device__ inline void reduceBlock(int* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] += data[threadIdx.x + p2];

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] += data[threadIdx.x + p2 / i];

    __syncthreads();
  }
}

static __device__ inline void reduceBlock(unsigned int* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] += data[threadIdx.x + p2];

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] += data[threadIdx.x + p2 / i];

    __syncthreads();
  }
}

static __device__ inline void reduceBlock(double* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] += data[threadIdx.x + p2];

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] += data[threadIdx.x + p2 / i];

    __syncthreads();
  }
}

static __device__ inline void cudaFillBlockData_int(int* data, const int &n, const int &value)
{
  int i;

  for(i = 0; i < n - blockDim.x; i += blockDim.x) {
    data[i + threadIdx.x] = value;
  }

  if(threadIdx.x < n - i) data[i + threadIdx.x] = value;
}

static __device__ inline void cudaFillBlockData_float(float* data, const int &n, const float &value)
{
  int i;

  for(i = 0; i < n - blockDim.x; i += blockDim.x) {
    data[i + threadIdx.x] = value;
  }

  if(threadIdx.x < n - i) data[i + threadIdx.x] = value;
}

static __device__ inline void reduce(float* data, int n) // caution: not verified to work correctly
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2];
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    j = 0; // restart the block-strided loop for this reduction level

    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2 / i];
      j++;
    }

    __syncthreads();
  }
}

static __device__ inline void reduce(double* data, int n) // caution: not verified to work correctly
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2];
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    j = 0; // restart the block-strided loop for this reduction level

    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2 / i];
      j++;
    }

    __syncthreads();
  }
}


static __device__ inline void minOfBlock(float* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] = MIN(data[threadIdx.x + p2], data[threadIdx.x]);

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] = MIN(data[threadIdx.x + p2 / i], data[threadIdx.x]);

    __syncthreads();
  }
}

static __device__ inline void maxOfBlock(float* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] = MAX(data[threadIdx.x + p2], data[threadIdx.x]);

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] = MAX(data[threadIdx.x + p2 / i], data[threadIdx.x]);

    __syncthreads();
  }
}

static __device__ inline void minOfBlock(double* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] = MIN(data[threadIdx.x + p2], data[threadIdx.x]);

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] = MIN(data[threadIdx.x + p2 / i], data[threadIdx.x]);

    __syncthreads();
  }
}

static __device__ inline void maxOfBlock(double* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] = MAX(data[threadIdx.x + p2], data[threadIdx.x]);

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] = MAX(data[threadIdx.x + p2 / i], data[threadIdx.x]);

    __syncthreads();
  }
}

static __device__ inline void minOfData(double* data, int n) // caution: not verified to work correctly
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    j = 0; // restart the block-strided loop for this reduction level

    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
      j++;
    }

    __syncthreads();
  }
}

static __device__ inline void maxOfData(double* data, int n) // caution: not verified to work correctly
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    j = 0; // restart the block-strided loop for this reduction level

    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
      j++;
    }

    __syncthreads();
  }
}

static __device__ inline void minOfData(float* data, int n) // caution: not verified to work correctly
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    j = 0; // restart the block-strided loop for this reduction level

    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
      j++;
    }

    __syncthreads();
  }
}

static __device__ inline void maxOfData(float* data, int n) // caution: not verified to work correctly
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    j = 0; // restart the block-strided loop for this reduction level

    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
      j++;
    }

    __syncthreads();
  }
}

#if X_PRECISION == 2
static __device__ inline double tex1Dfetch_double(texture<int2, 1> t, int i)
{
  int2 v = tex1Dfetch(t, i);
  return __hiloint2double(v.y, v.x);
}

static __device__ inline X_FLOAT4 tex1Dfetch_double(texture<int4, 1> t, int i)
{
  int4 v = tex1Dfetch(t, 2 * i);
  int4 u = tex1Dfetch(t, 2 * i + 1);
  X_FLOAT4 w;

  w.x = __hiloint2double(v.y, v.x);
  w.y = __hiloint2double(v.w, v.z);
  w.z = __hiloint2double(u.y, u.x);
  w.w = __hiloint2double(u.w, u.z);
  return w;
}
#endif

inline void BindXTypeTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  _x_type_tex.normalized = false;                    // do not use normalized texture coordinates
  _x_type_tex.filterMode = cudaFilterModePoint;      // point (nearest) filtering, no interpolation
  _x_type_tex.addressMode[0] = cudaAddressModeWrap;  // wrap texture coordinates
  const textureReference* x_type_texture_ptr = &MY_AP(x_type_tex);

#if X_PRECISION == 1
  cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float4>();
  cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(X_FLOAT4));
#else
  cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int4>();
  cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}

static __device__ inline X_FLOAT4 fetchXType(int i)
{
#ifdef CUDA_USE_TEXTURE
#if X_PRECISION == 1
  return tex1Dfetch(_x_type_tex, i);
#else
  return tex1Dfetch_double(_x_type_tex, i);
#endif
#else
  return _x_type[i];
#endif
}

#if V_PRECISION == 2
static __device__ inline double tex1Dfetch_double_v(texture<int2, 1> t, int i)
{
  int2 v = tex1Dfetch(t, i);
  return __hiloint2double(v.y, v.x);
}

static __device__ inline V_FLOAT4 tex1Dfetch_double_v(texture<int4, 1> t, int i)
{
  int4 v = tex1Dfetch(t, 2 * i);
  int4 u = tex1Dfetch(t, 2 * i + 1);
  V_FLOAT4 w;

  w.x = __hiloint2double(v.y, v.x);
  w.y = __hiloint2double(v.w, v.z);
  w.z = __hiloint2double(u.y, u.x);
  w.w = __hiloint2double(u.w, u.z);
  return w;
}
#endif

inline void BindVRadiusTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  _v_radius_tex.normalized = false;                    // do not use normalized texture coordinates
  _v_radius_tex.filterMode = cudaFilterModePoint;      // point (nearest) filtering, no interpolation
  _v_radius_tex.addressMode[0] = cudaAddressModeWrap;  // wrap texture coordinates
  const textureReference* v_radius_texture_ptr = &MY_AP(v_radius_tex);

#if V_PRECISION == 1
  cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc<float4>();
  cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * sizeof(V_FLOAT4));
#else
  cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc<int4>();
  cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}

static __device__ inline V_FLOAT4 fetchVRadius(int i)
{
#ifdef CUDA_USE_TEXTURE
#if V_PRECISION == 1
  return tex1Dfetch(_v_radius_tex, i);
#else
  return tex1Dfetch_double_v(_v_radius_tex, i);
#endif
#else
  return _v_radius[i];
#endif
}

inline void BindOmegaRmassTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  _omega_rmass_tex.normalized = false;                    // do not use normalized texture coordinates
  _omega_rmass_tex.filterMode = cudaFilterModePoint;      // point (nearest) filtering, no interpolation
  _omega_rmass_tex.addressMode[0] = cudaAddressModeWrap;  // wrap texture coordinates
  const textureReference* omega_rmass_texture_ptr = &MY_AP(omega_rmass_tex);

#if V_PRECISION == 1
  cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc<float4>();
  cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * sizeof(V_FLOAT4));
#else
  cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc<int4>();
  cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}

static __device__ inline V_FLOAT4 fetchOmegaRmass(int i)
{
#ifdef CUDA_USE_TEXTURE
#if V_PRECISION == 1
  return tex1Dfetch(_omega_rmass_tex, i);
#else
  return tex1Dfetch_double_v(_omega_rmass_tex, i);
#endif
#else
  return _omega_rmass[i];
#endif
}

#if F_PRECISION == 2
static __device__ inline double tex1Dfetch_double_f(texture<int2, 1> t, int i)
{
  int2 v = tex1Dfetch(t, i);
  return __hiloint2double(v.y, v.x);
}

static __device__ inline F_FLOAT4 tex1Dfetch_double_f(texture<int4, 1> t, int i)
{
  int4 v = tex1Dfetch(t, 2 * i);
  int4 u = tex1Dfetch(t, 2 * i + 1);
  F_FLOAT4 w;

  w.x = __hiloint2double(v.y, v.x);
  w.y = __hiloint2double(v.w, v.z);
  w.z = __hiloint2double(u.y, u.x);
  w.w = __hiloint2double(u.w, u.z);
  return w;
}
#endif

inline void BindQTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  _q_tex.normalized = false;                    // do not use normalized texture coordinates
  _q_tex.filterMode = cudaFilterModePoint;      // point (nearest) filtering, no interpolation
  _q_tex.addressMode[0] = cudaAddressModeWrap;  // wrap texture coordinates
  const textureReference* q_texture_ptr = &MY_AP(q_tex);

#if F_PRECISION == 1
  cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc<float>();
  cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(F_FLOAT));
#else
  cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc<int2>();
  cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(int2));
#endif
#endif
}

static __device__ inline F_FLOAT fetchQ(int i)
{
#ifdef CUDA_USE_TEXTURE
#if F_PRECISION == 1
  return tex1Dfetch(_q_tex, i);
#else
  return tex1Dfetch_double_f(_q_tex, i);
#endif
#else
  return _q[i];
#endif
}

#endif

/*

inline void BindPairCoeffTypeTexture(cuda_shared_data* sdata, coeff_tex)
{
#ifdef CUDA_USE_TEXTURE
  _coeff_tex.normalized = false;                     // access with normalized texture coordinates
  _coeff_tex.filterMode = cudaFilterModePoint;       // Point mode, so no
  _coeff_tex.addressMode[0] = cudaAddressModeWrap;   // wrap texture coordinates
  const textureReference* coeff_texture_ptr;
  cudaGetTextureReference(&coeff_texture_ptr, &MY_AP(coeff_tex));

#if F_PRECISION == 1
  cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float4>();
  cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(X_FLOAT4));
#else
  cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int4>();
  cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}

static __device__ inline X_FLOAT4 fetchXType(int i)
{
#ifdef CUDA_USE_TEXTURE
#if X_PRECISION == 1
  return tex1Dfetch(_x_type_tex, i);
#else
  return tex1Dfetch_double(_x_type_tex, i);
#endif
#else
  return _x_type[i];
#endif
}
*/
#define SBBITS 30

static inline __device__ int sbmask(int j)
{
  return j >> SBBITS & 3;
}

static inline __device__ void minimum_image(X_FLOAT4 &delta)
{
  if(_triclinic == 0) {
    if(_periodicity[0]) {
      delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] :
                 (delta.x > X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0));
    }

    if(_periodicity[1]) {
      delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] :
                 (delta.y > X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0));
    }

    if(_periodicity[2]) {
      delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] :
                 (delta.z > X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0));
    }

  } else {
    if(_periodicity[2]) {
      // apply the tilt corrections to x and y before wrapping z, so that all
      // three tests see the unwrapped delta.z
      delta.x += delta.z < -X_F(0.5) * _prd[2] ? _h[4] :
                 (delta.z > X_F(0.5) * _prd[2] ? -_h[4] : X_F(0.0));
      delta.y += delta.z < -X_F(0.5) * _prd[2] ? _h[3] :
                 (delta.z > X_F(0.5) * _prd[2] ? -_h[3] : X_F(0.0));
      delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] :
                 (delta.z > X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0));
    }

    if(_periodicity[1]) {
      delta.x += delta.y < -X_F(0.5) * _prd[1] ? _h[5] :
                 (delta.y > X_F(0.5) * _prd[1] ? -_h[5] : X_F(0.0));
      delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] :
                 (delta.y > X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0));
    }

    if(_periodicity[0]) {
      delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] :
                 (delta.x > X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0));
    }
  }
}

static inline __device__ void closest_image(X_FLOAT4 &x1, X_FLOAT4 &x2, X_FLOAT4 &ci)
{
  ci.x = x2.x - x1.x;
  ci.y = x2.y - x1.y;
  ci.z = x2.z - x1.z;
  minimum_image(ci);
  ci.x += x1.x;
  ci.y += x1.y;
  ci.z += x1.z;
}
