1 // **************************************************************************
2 // ellipsoid_nbor.cu
3 // -------------------
4 // W. Michael Brown (ORNL)
5 //
6 // Device code for Ellipsoid neighbor routines
7 //
8 // __________________________________________________________________________
9 // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
10 // __________________________________________________________________________
11 //
12 // begin :
13 // email : brownw@ornl.gov
14 // ***************************************************************************/
15
16 #ifdef NV_KERNEL
17 #include "lal_preprocessor.h"
18 #ifndef _DOUBLE_DOUBLE
19 texture<float4> pos_tex;
20 #else
21 texture<int4,1> pos_tex;
22 #endif
23 #else
24 #define pos_tex x_
25 #endif
26
27 // ---------------------------------------------------------------------------
28 // Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
29 // -- Only unpack neighbors matching the specified inclusive range of forms
30 // -- Only unpack neighbors within cutoff
31 // ---------------------------------------------------------------------------
kernel_nbor(const __global numtyp4 * restrict x_,const __global numtyp2 * restrict cut_form,const int ntypes,__global int * dev_nbor,const int nbor_pitch,const int start,const int inum,const __global int * dev_ij,const int form_low,const int form_high)32 __kernel void kernel_nbor(const __global numtyp4 *restrict x_,
33 const __global numtyp2 *restrict cut_form,
34 const int ntypes,
35 __global int *dev_nbor,
36 const int nbor_pitch, const int start, const int inum,
37 const __global int *dev_ij,
38 const int form_low, const int form_high) {
39
40 // ii indexes the two interacting particles in gi
41 int ii=GLOBAL_ID_X+start;
42
43 if (ii<inum) {
44 const __global int *nbor=dev_ij+ii;
45 int i=*nbor;
46 nbor+=nbor_pitch;
47 int numj=*nbor;
48 nbor+=nbor_pitch;
49 const __global int *list_end=nbor+fast_mul(numj,nbor_pitch);
50 __global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
51
52 numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
53 int iw=ix.w;
54 int itype=fast_mul(iw,ntypes);
55 int newj=0;
56 for ( ; nbor<list_end; nbor+=nbor_pitch) {
57 int j=*nbor;
58 j &= NEIGHMASK;
59 numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
60 int jtype=jx.w;
61 int mtype=itype+jtype;
62 numtyp2 cf=cut_form[mtype];
63 if (cf.y>=form_low && cf.y<=form_high) {
64 // Compute r12;
65 numtyp rsq=jx.x-ix.x;
66 rsq*=rsq;
67 numtyp t=jx.y-ix.y;
68 rsq+=t*t;
69 t=jx.z-ix.z;
70 rsq+=t*t;
71
72 if (rsq<cf.x) {
73 *packed=j;
74 packed+=nbor_pitch;
75 newj++;
76 }
77 }
78 }
79 dev_nbor[ii+nbor_pitch]=newj;
80 }
81 }
82
83 // ---------------------------------------------------------------------------
84 // Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
85 // -- Only unpack neighbors matching the specified inclusive range of forms
86 // -- Only unpack neighbors within cutoff
87 // -- Fast version of routine that uses shared memory for LJ constants
88 // ---------------------------------------------------------------------------
kernel_nbor_fast(const __global numtyp4 * restrict x_,const __global numtyp2 * restrict cut_form,__global int * dev_nbor,const int nbor_pitch,const int start,const int inum,const __global int * dev_ij,const int form_low,const int form_high)89 __kernel void kernel_nbor_fast(const __global numtyp4 *restrict x_,
90 const __global numtyp2 *restrict cut_form,
91 __global int *dev_nbor,
92 const int nbor_pitch, const int start,
93 const int inum,
94 const __global int *dev_ij,
95 const int form_low, const int form_high) {
96
97 int ii=THREAD_ID_X;
98 __local int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
99 __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
100 if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
101 cutsq[ii]=cut_form[ii].x;
102 form[ii]=cut_form[ii].y;
103 }
104 ii+=fast_mul((int)BLOCK_SIZE_X,(int)BLOCK_ID_X)+start;
105 __syncthreads();
106
107 if (ii<inum) {
108 const __global int *nbor=dev_ij+ii;
109 int i=*nbor;
110 nbor+=nbor_pitch;
111 int numj=*nbor;
112 nbor+=nbor_pitch;
113 const __global int *list_end=nbor+fast_mul(numj,nbor_pitch);
114 __global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
115
116 numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
117 int iw=ix.w;
118 int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
119
120 int newj=0;
121 for ( ; nbor<list_end; nbor+=nbor_pitch) {
122 int j=*nbor;
123 j &= NEIGHMASK;
124 numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
125 int jtype=jx.w;
126 int mtype=itype+jtype;
127
128 if (form[mtype]>=form_low && form[mtype]<=form_high) {
129 // Compute r12;
130 numtyp rsq=jx.x-ix.x;
131 rsq*=rsq;
132 numtyp t=jx.y-ix.y;
133 rsq+=t*t;
134 t=jx.z-ix.z;
135 rsq+=t*t;
136
137 if (rsq<cutsq[mtype]) {
138 *packed=j;
139 packed+=nbor_pitch;
140 newj++;
141 }
142 }
143 }
144 dev_nbor[ii+nbor_pitch]=newj;
145 }
146 }
147