1 // **************************************************************************
2 //                              ellipsoid_nbor.cu
3 //                             -------------------
4 //                           W. Michael Brown (ORNL)
5 //
6 //  Device code for Ellipsoid neighbor routines
7 //
8 // __________________________________________________________________________
9 //    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
10 // __________________________________________________________________________
11 //
12 //    begin                :
13 //    email                : brownw@ornl.gov
14 // ***************************************************************************/
15 
16 #ifdef NV_KERNEL
17 #include "lal_preprocessor.h"
18 #ifndef _DOUBLE_DOUBLE
19 texture<float4> pos_tex;
20 #else
21 texture<int4,1> pos_tex;
22 #endif
23 #else
24 #define pos_tex x_
25 #endif
26 
27 // ---------------------------------------------------------------------------
28 // Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
29 // -- Only unpack neighbors matching the specified inclusive range of forms
30 // -- Only unpack neighbors within cutoff
31 // ---------------------------------------------------------------------------
kernel_nbor(const __global numtyp4 * restrict x_,const __global numtyp2 * restrict cut_form,const int ntypes,__global int * dev_nbor,const int nbor_pitch,const int start,const int inum,const __global int * dev_ij,const int form_low,const int form_high)32 __kernel void kernel_nbor(const __global numtyp4 *restrict x_,
33                           const __global numtyp2 *restrict cut_form,
34                           const int ntypes,
35                           __global int *dev_nbor,
36                           const int nbor_pitch, const int start, const int inum,
37                           const __global int *dev_ij,
38                           const int form_low, const int form_high) {
39 
40   // ii indexes the two interacting particles in gi
41   int ii=GLOBAL_ID_X+start;
42 
43   if (ii<inum) {
44     const __global int *nbor=dev_ij+ii;
45     int i=*nbor;
46     nbor+=nbor_pitch;
47     int numj=*nbor;
48     nbor+=nbor_pitch;
49     const __global int *list_end=nbor+fast_mul(numj,nbor_pitch);
50     __global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
51 
52     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
53     int iw=ix.w;
54     int itype=fast_mul(iw,ntypes);
55     int newj=0;
56     for ( ; nbor<list_end; nbor+=nbor_pitch) {
57       int j=*nbor;
58       j &= NEIGHMASK;
59       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
60       int jtype=jx.w;
61       int mtype=itype+jtype;
62       numtyp2 cf=cut_form[mtype];
63       if (cf.y>=form_low && cf.y<=form_high) {
64         // Compute r12;
65         numtyp rsq=jx.x-ix.x;
66         rsq*=rsq;
67         numtyp t=jx.y-ix.y;
68         rsq+=t*t;
69         t=jx.z-ix.z;
70         rsq+=t*t;
71 
72         if (rsq<cf.x) {
73           *packed=j;
74           packed+=nbor_pitch;
75           newj++;
76         }
77       }
78     }
79     dev_nbor[ii+nbor_pitch]=newj;
80   }
81 }
82 
83 // ---------------------------------------------------------------------------
84 // Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
85 // -- Only unpack neighbors matching the specified inclusive range of forms
86 // -- Only unpack neighbors within cutoff
87 // -- Fast version of routine that uses shared memory for LJ constants
88 // ---------------------------------------------------------------------------
kernel_nbor_fast(const __global numtyp4 * restrict x_,const __global numtyp2 * restrict cut_form,__global int * dev_nbor,const int nbor_pitch,const int start,const int inum,const __global int * dev_ij,const int form_low,const int form_high)89 __kernel void kernel_nbor_fast(const __global numtyp4 *restrict x_,
90                                const __global numtyp2 *restrict cut_form,
91                                __global int *dev_nbor,
92                                const int nbor_pitch, const int start,
93                                const int inum,
94                                const __global int *dev_ij,
95                                const int form_low, const int form_high) {
96 
97   int ii=THREAD_ID_X;
98   __local int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
99   __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
100   if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
101     cutsq[ii]=cut_form[ii].x;
102     form[ii]=cut_form[ii].y;
103   }
104   ii+=fast_mul((int)BLOCK_SIZE_X,(int)BLOCK_ID_X)+start;
105   __syncthreads();
106 
107   if (ii<inum) {
108     const __global int *nbor=dev_ij+ii;
109     int i=*nbor;
110     nbor+=nbor_pitch;
111     int numj=*nbor;
112     nbor+=nbor_pitch;
113     const __global int *list_end=nbor+fast_mul(numj,nbor_pitch);
114     __global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
115 
116     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
117     int iw=ix.w;
118     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
119 
120     int newj=0;
121     for ( ; nbor<list_end; nbor+=nbor_pitch) {
122       int j=*nbor;
123       j &= NEIGHMASK;
124       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
125       int jtype=jx.w;
126       int mtype=itype+jtype;
127 
128       if (form[mtype]>=form_low && form[mtype]<=form_high) {
129         // Compute r12;
130         numtyp rsq=jx.x-ix.x;
131         rsq*=rsq;
132         numtyp t=jx.y-ix.y;
133         rsq+=t*t;
134         t=jx.z-ix.z;
135         rsq+=t*t;
136 
137         if (rsq<cutsq[mtype]) {
138           *packed=j;
139           packed+=nbor_pitch;
140           newj++;
141         }
142       }
143     }
144     dev_nbor[ii+nbor_pitch]=newj;
145   }
146 }
147