1 /* ----------------------------------------------------------------------
2    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
3 
4    Original Version:
5    http://lammps.sandia.gov, Sandia National Laboratories
6    Steve Plimpton, sjplimp@sandia.gov
7 
8    See the README file in the top-level LAMMPS directory.
9 
10    -----------------------------------------------------------------------
11 
12    USER-CUDA Package and associated modifications:
13    https://sourceforge.net/projects/lammpscuda/
14 
15    Christian Trott, christian.trott@tu-ilmenau.de
16    Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
17    Theoretical Physics II, University of Technology Ilmenau, Germany
18 
19    See the README file in the USER-CUDA directory.
20 
21    This software is distributed under the GNU General Public License.
22 ------------------------------------------------------------------------- */
23 
24 #define SBBITS 30
25 
Binning_Kernel(int * binned_id,int bin_nmax,int bin_dim_x,int bin_dim_y,int bin_dim_z,CUDA_FLOAT rez_bin_size_x,CUDA_FLOAT rez_bin_size_y,CUDA_FLOAT rez_bin_size_z)26 __global__ void Binning_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, int bin_dim_z,
27                                CUDA_FLOAT rez_bin_size_x, CUDA_FLOAT rez_bin_size_y, CUDA_FLOAT rez_bin_size_z)
28 {
29   int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
30 
31   /*int* bin_count=(int*) _buffer;
32   bin_count=bin_count+20;
33   CUDA_FLOAT* binned_x=(CUDA_FLOAT*)(bin_count+bin_dim_x*bin_dim_y*bin_dim_z);*/
34   CUDA_FLOAT* binned_x = (CUDA_FLOAT*) _buffer;
35   binned_x = &binned_x[2];
36   int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax];
37 
38   if(i < _nall) {
39     // copy atom position from global device memory to local register
40     // in this 3 steps to get as much coalesced access as possible
41     X_FLOAT* my_x = _x + i;
42     CUDA_FLOAT x_i = *my_x;
43     my_x += _nmax;
44     CUDA_FLOAT y_i = *my_x;
45     my_x += _nmax;
46     CUDA_FLOAT z_i = *my_x;
47 
48 
49     // calculate flat bin index
50     int bx = __float2int_rd(rez_bin_size_x * (x_i - _sublo[0])) + 2;
51     int by = __float2int_rd(rez_bin_size_y * (y_i - _sublo[1])) + 2;
52     int bz = __float2int_rd(rez_bin_size_z * (z_i - _sublo[2])) + 2;
53 
54     bx -= bx * negativCUDA(1.0f * bx);
55     bx -= (bx - bin_dim_x + 1) * negativCUDA(1.0f * bin_dim_x - 1.0f - 1.0f * bx);
56     by -= by * negativCUDA(1.0f * by);
57     by -= (by - bin_dim_y + 1) * negativCUDA(1.0f * bin_dim_y - 1.0f - 1.0f * by);
58     bz -= bz * negativCUDA(1.0f * bz);
59     bz -= (bz - bin_dim_z + 1) * negativCUDA(1.0f * bin_dim_z - 1.0f - 1.0f * bz);
60 
61 
62     const unsigned j = bin_dim_z * (bin_dim_y * bx + by) + bz;
63 
64     // add new atom to bin, get bin-array position
65     const unsigned k = atomicAdd(& bin_count[j], 1);
66 
67     if(k < bin_nmax) {
68       binned_id [bin_nmax * j + k] = i;
69       binned_x [3 * bin_nmax * j + k] = x_i;
70       binned_x [3 * bin_nmax * j + k + bin_nmax] = y_i;
71       binned_x [3 * bin_nmax * j + k + 2 * bin_nmax] = z_i;
72     } else {
73       // normally, this should not happen:
74       int errorn = atomicAdd((int*) _buffer, 1);
75       MYEMUDBG(printf("# CUDA: Binning_Kernel: WARNING: atom %i ignored, no place left in bin %u\n", i, j);)
76     }
77   }
78 }
79 
80 
exclusion(int & i,int & j,int & itype,int & jtype)81 __device__ inline int exclusion(int &i, int &j, int &itype, int &jtype)
82 {
83   int m;
84 
85   if(_nex_type)
86     if(_ex_type[itype * _cuda_ntypes + jtype]) return 1;
87 
88   if(_nex_group) {
89     for(m = 0; m < _nex_group; m++) {
90       if(_mask[i] & _ex1_bit[m] && _mask[j] & _ex2_bit[m]) return 1;
91 
92       if(_mask[i] & _ex2_bit[m] && _mask[j] & _ex1_bit[m]) return 1;
93     }
94   }
95 
96   if(_nex_mol) {
97     if(_molecule[i] == _molecule[j])
98       for(m = 0; m < _nex_mol; m++)
99         if(_mask[i] & _ex_mol_bit[m] && _mask[j] & _ex_mol_bit[m]) return 1;
100   }
101 
102   return 0;
103 }
104 
105 extern __shared__ CUDA_FLOAT shared[];
106 
find_special(int3 & n,int * list,int & tag,int3 flag)107 __device__ inline int find_special(int3 &n, int* list, int &tag, int3 flag)
108 {
109   int k = n.z;
110 
111   for(int l = 0; l < n.z; l++) k = ((list[l] == tag) ? l : k);
112 
113   return k < n.x ? flag.x : (k < n.y ? flag.y : (k < n.z ? flag.z : 0));
114 }
115 
116 template <const unsigned int exclude>
NeighborBuildFullBin_Kernel(int * binned_id,int bin_nmax,int bin_dim_x,int bin_dim_y,CUDA_FLOAT globcutoff,int block_style,bool neighall)117 __global__ void NeighborBuildFullBin_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, CUDA_FLOAT globcutoff, int block_style, bool neighall)
118 {
119   int natoms = neighall ? _nall : _nlocal;
120   //const bool domol=false;
121   int bin_dim_z = gridDim.y;
122   CUDA_FLOAT* binned_x = (CUDA_FLOAT*) _buffer;
123   binned_x = &binned_x[2];
124   int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax];
125   int bin = __mul24(gridDim.y, blockIdx.x) + blockIdx.y;
126   int bin_x = blockIdx.x / bin_dim_y;
127   int bin_y = blockIdx.x - bin_x * bin_dim_y;
128   int bin_z = blockIdx.y;
129   int bin_c = bin_count[bin];
130 
131 
132   CUDA_FLOAT cut;
133 
134   if(globcutoff > 0)
135     cut = globcutoff;
136 
137   int i = _nall;
138   CUDA_FLOAT* my_x;
139   CUDA_FLOAT x_i, y_i, z_i;
140 
141   for(int actOffset = 0; actOffset < bin_c; actOffset += blockDim.x) {
142 
143     int actIdx = threadIdx.x + actOffset;
144     CUDA_FLOAT* other_x = shared;
145     int* other_id = (int*) &other_x[3 * blockDim.x];
146 
147     if(actIdx < bin_c) {
148       i = binned_id[__mul24(bin, bin_nmax) + actIdx];
149       my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + actIdx;
150       x_i = *my_x;
151       my_x += bin_nmax;
152       y_i = *my_x;
153       my_x += bin_nmax;
154       z_i = *my_x;
155     } else
156       i = 2 * _nall;
157 
158     __syncthreads();
159 
160     int jnum = 0;
161     int itype;
162 
163     if(i < natoms) {
164       jnum = 0;
165       _ilist[i] = i;
166       itype = _type[i];
167     }
168 
169     //__syncthreads();
170 
171 
172     for(int otherActOffset = 0; otherActOffset < bin_c; otherActOffset += blockDim.x) {
173       int otherActIdx = threadIdx.x + otherActOffset;
174 
175       if(otherActIdx < bin_c) {
176         if(otherActOffset == actOffset) {
177           other_id[threadIdx.x] = i;
178           other_x[threadIdx.x] = x_i;
179           other_x[threadIdx.x + blockDim.x] = y_i;
180           other_x[threadIdx.x + 2 * blockDim.x] = z_i;
181         } else {
182           other_id[threadIdx.x] = binned_id[__mul24(bin, bin_nmax) + otherActIdx];
183           my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + otherActIdx;
184           other_x[threadIdx.x] = *my_x;
185           my_x += bin_nmax;
186           other_x[threadIdx.x + blockDim.x] = *my_x;
187           my_x += bin_nmax;
188           other_x[threadIdx.x + __mul24(2, blockDim.x)] = *my_x;
189 
190         }
191       }
192 
193       __syncthreads();
194       int kk = threadIdx.x;
195 
196       for(int k = 0; k < MIN(bin_c - otherActOffset, blockDim.x); ++k) {
197         if(i < natoms) {
198           kk++;
199           kk = kk < MIN(bin_c - otherActOffset, blockDim.x) ? kk : 0;
200           int j = other_id[kk];
201 
202           if(exclude && exclusion(i, j, itype, _type[j])) continue;
203 
204           if(globcutoff < 0) {
205             int jtype = _type[j];
206             cut = _cutneighsq[itype * _cuda_ntypes + jtype];
207           }
208 
209           CUDA_FLOAT delx = x_i - other_x[kk];
210           CUDA_FLOAT dely = y_i - other_x[kk + blockDim.x];
211           CUDA_FLOAT delz = z_i - other_x[kk + 2 * blockDim.x];
212           CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;
213 
214 
215           if(rsq <= cut && i != j) {
216             if(jnum < _maxneighbors) {
217               if(block_style)
218                 _neighbors[i * _maxneighbors + jnum] = j;
219               else
220                 _neighbors[i + jnum * natoms] = j;
221             }
222 
223             ++jnum;
224           }
225         }
226       }
227 
228       __syncthreads();
229 
230     }
231 
232     for(int obin_x = bin_x - 1; obin_x < bin_x + 2; obin_x++)
233       for(int obin_y = bin_y - 1; obin_y < bin_y + 2; obin_y++)
234         for(int obin_z = bin_z - 1; obin_z < bin_z + 2; obin_z++) {
235           if(obin_x < 0 || obin_y < 0 || obin_z < 0) continue;
236 
237           if(obin_x >= bin_dim_x || obin_y >= bin_dim_y || obin_z >= bin_dim_z) continue;
238 
239           int other_bin = bin_dim_z * (bin_dim_y * obin_x + obin_y) + obin_z;
240 
241           if(other_bin == bin) continue;
242 
243           int obin_c = bin_count[other_bin];
244 
245           for(int otherActOffset = 0; otherActOffset < obin_c; otherActOffset += blockDim.x) {
246             int otherActIdx = otherActOffset + threadIdx.x;
247 
248             if(threadIdx.x < MIN(blockDim.x, obin_c - otherActOffset)) {
249               other_id[threadIdx.x] = binned_id[__mul24(other_bin, bin_nmax) + otherActIdx];
250               my_x = binned_x + __mul24(__mul24(other_bin, 3), bin_nmax) + otherActIdx;
251               other_x[threadIdx.x] = *my_x;
252               my_x += bin_nmax;
253               other_x[threadIdx.x + blockDim.x] = *my_x;
254               my_x += bin_nmax;
255               other_x[threadIdx.x + 2 * blockDim.x] = *my_x;
256             }
257 
258             __syncthreads();
259 
260             for(int k = 0; k < MIN(blockDim.x, obin_c - otherActOffset); ++k) {
261               if(i < natoms) {
262                 int j = other_id[k];
263 
264                 if(exclude && exclusion(i, j, itype, _type[j])) continue;
265 
266                 if(globcutoff < 0) {
267                   int jtype = _type[j];
268                   cut = _cutneighsq[itype * _cuda_ntypes + jtype];
269                 }
270 
271                 CUDA_FLOAT delx = x_i - other_x[k];
272                 CUDA_FLOAT dely = y_i - other_x[k + blockDim.x];
273                 CUDA_FLOAT delz = z_i - other_x[k + 2 * blockDim.x];
274                 CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;
275 
276                 if(rsq <= cut && i != j) {
277                   if(jnum < _maxneighbors) {
278                     if(block_style)
279                       _neighbors[i * _maxneighbors + jnum] = j;
280                     else
281                       _neighbors[i + jnum * natoms] = j;
282                   }
283 
284                   ++jnum;
285                 }
286               }
287             }
288 
289             __syncthreads();
290 
291           }
292         }
293 
294     if(jnum > _maxneighbors)((int*)_buffer)[0] = -jnum;
295 
296     if(i < natoms)
297       _numneigh[i] = jnum;
298   }
299 }
300 
301 
FindSpecial(int block_style)302 __global__ void FindSpecial(int block_style)
303 {
304   int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
305   int which;
306   int tag_mask = 0;
307   int3 spec_flag;
308 
309   int3 mynspecial = {0, 0, 1};
310 
311   if(ii >= _nlocal) return;
312 
313   int special_id[CUDA_MAX_NSPECIAL];
314 
315   int i = _ilist[ii];
316 
317   if(i >= _nlocal) return;
318 
319   int jnum = _numneigh[i];
320 
321   if(_special_flag[1] == 0) spec_flag.x = -1;
322   else if(_special_flag[1] == 1) spec_flag.x = 0;
323   else spec_flag.x = 1;
324 
325   if(_special_flag[2] == 0) spec_flag.y = -1;
326   else if(_special_flag[2] == 1) spec_flag.y = 0;
327   else spec_flag.y = 2;
328 
329   if(_special_flag[3] == 0) spec_flag.z = -1;
330   else if(_special_flag[3] == 1) spec_flag.z = 0;
331   else spec_flag.z = 3;
332 
333   mynspecial.x = _nspecial[i];
334   mynspecial.y = _nspecial[i + _nmax];
335   mynspecial.z = _nspecial[i + 2 * _nmax];
336 
337   if(i < _nlocal) {
338     int* list = &_special[i];
339 
340     for(int k = 0; k < mynspecial.z; k++) {
341       special_id[k] = list[k * _nmax];
342       tag_mask = tag_mask | special_id[k];
343     }
344   }
345 
346 
347   for(int k = 0; k < MIN(jnum, _maxneighbors); k++) {
348     int j;
349 
350     if(block_style)
351       j = _neighbors[i * _maxneighbors + k];
352     else
353       j = _neighbors[i + k * _nlocal];
354 
355     int tag_j = _tag[j];
356     which = 0;
357 
358     if((tag_mask & tag_j) == tag_j) {
359       which = find_special(mynspecial, special_id, tag_j, spec_flag);
360 
361       if(which > 0) {
362         if(block_style)
363           _neighbors[i * _maxneighbors + k] = j ^ (which << SBBITS);
364         else
365           _neighbors[i + k * _nlocal] = j ^ (which << SBBITS);
366       } else if(which < 0) {
367         if(block_style)
368           _neighbors[i * _maxneighbors + k] = _neighbors[i * _maxneighbors + jnum - 1];
369         else
370           _neighbors[i + k * _nlocal] = _neighbors[i + (jnum - 1) * _nlocal];
371 
372         jnum--;
373         k--;
374       }
375     }
376   }
377 
378   _numneigh[i] = jnum;
379 }
380 
NeighborBuildFullBin_OverlapComm_Kernel(int * binned_id,int bin_nmax,int bin_dim_x,int bin_dim_y,CUDA_FLOAT globcutoff,int block_style)381 __global__ void NeighborBuildFullBin_OverlapComm_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, CUDA_FLOAT globcutoff, int block_style)
382 {
383   int bin_dim_z = gridDim.y;
384   CUDA_FLOAT* binned_x = (CUDA_FLOAT*) _buffer;
385   binned_x = &binned_x[2];
386   int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax];
387   int bin = __mul24(gridDim.y, blockIdx.x) + blockIdx.y;
388   int bin_x = blockIdx.x / bin_dim_y;
389   int bin_y = blockIdx.x - bin_x * bin_dim_y;
390   int bin_z = blockIdx.y;
391   int bin_c = bin_count[bin];
392 
393 
394   CUDA_FLOAT cut;
395 
396   if(globcutoff > 0)
397     cut = globcutoff;
398 
399   int i = _nall;
400   CUDA_FLOAT* my_x;
401   CUDA_FLOAT x_i, y_i, z_i;
402 
403   for(int actOffset = 0; actOffset < bin_c; actOffset += blockDim.x) {
404 
405     int actIdx = threadIdx.x + actOffset;
406     CUDA_FLOAT* other_x = shared;
407     int* other_id = (int*) &other_x[3 * blockDim.x];
408 
409     if(actIdx < bin_c) {
410       i = binned_id[__mul24(bin, bin_nmax) + actIdx];
411       my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + actIdx;
412       x_i = *my_x;
413       my_x += bin_nmax;
414       y_i = *my_x;
415       my_x += bin_nmax;
416       z_i = *my_x;
417     } else
418       i = 2 * _nall;
419 
420     __syncthreads();
421 
422     int jnum = 0;
423     int jnum_border = 0;
424     int jnum_inner = 0;
425     int i_border = -1;
426     int itype;
427 
428     if(i < _nlocal) {
429       jnum = 0;
430       _ilist[i] = i;
431       itype = _type[i];
432     }
433 
434     __syncthreads();
435 
436 
437     for(int otherActOffset = 0; otherActOffset < bin_c; otherActOffset += blockDim.x) {
438       int otherActIdx = threadIdx.x + otherActOffset;
439 
440       if(otherActIdx < bin_c) {
441         if(otherActOffset == actOffset) {
442           other_id[threadIdx.x] = i;
443           other_x[threadIdx.x] = x_i;
444           other_x[threadIdx.x + blockDim.x] = y_i;
445           other_x[threadIdx.x + 2 * blockDim.x] = z_i;
446         } else {
447           other_id[threadIdx.x] = binned_id[__mul24(bin, bin_nmax) + otherActIdx];
448           my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + otherActIdx;
449           other_x[threadIdx.x] = *my_x;
450           my_x += bin_nmax;
451           other_x[threadIdx.x + blockDim.x] = *my_x;
452           my_x += bin_nmax;
453           other_x[threadIdx.x + __mul24(2, blockDim.x)] = *my_x;
454 
455         }
456       }
457 
458       __syncthreads();
459       int kk = threadIdx.x;
460 
461       for(int k = 0; k < MIN(bin_c - otherActOffset, blockDim.x); ++k) {
462         if(i < _nlocal) {
463           kk++;
464           kk = kk < MIN(bin_c - otherActOffset, blockDim.x) ? kk : 0;
465           int j = other_id[kk];
466 
467           if(globcutoff < 0) {
468             int jtype = _type[j];
469             cut = _cutneighsq[itype * _cuda_ntypes + jtype];
470           }
471 
472           CUDA_FLOAT delx = x_i - other_x[kk];
473           CUDA_FLOAT dely = y_i - other_x[kk + blockDim.x];
474           CUDA_FLOAT delz = z_i - other_x[kk + 2 * blockDim.x];
475           CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;
476 
477 
478           if(rsq <= cut && i != j) {
479             if((j >= _nlocal) && (i_border < 0))
480               i_border = atomicAdd(_inum_border, 1);
481 
482             if(jnum < _maxneighbors) {
483               if(block_style) {
484                 _neighbors[i * _maxneighbors + jnum] = j;
485 
486                 if(j >= _nlocal) {
487                   _neighbors_border[i_border * _maxneighbors + jnum_border] = j;
488                 } else {
489                   _neighbors_inner[i * _maxneighbors + jnum_inner] = j;
490                 }
491               } else {
492                 _neighbors[i + jnum * _nlocal] = j;
493 
494                 if(j >= _nlocal) {
495                   _neighbors_border[i_border + jnum_border * _nlocal] = j;
496                 } else {
497                   _neighbors_inner[i + jnum_inner * _nlocal] = j;
498                 }
499               }
500             }
501 
502             ++jnum;
503 
504             if(j >= _nlocal)
505               jnum_border++;
506             else
507               jnum_inner++;
508           }
509         }
510       }
511 
512       __syncthreads();
513     }
514 
515     for(int obin_x = bin_x - 1; obin_x < bin_x + 2; obin_x++)
516       for(int obin_y = bin_y - 1; obin_y < bin_y + 2; obin_y++)
517         for(int obin_z = bin_z - 1; obin_z < bin_z + 2; obin_z++) {
518           if(obin_x < 0 || obin_y < 0 || obin_z < 0) continue;
519 
520           if(obin_x >= bin_dim_x || obin_y >= bin_dim_y || obin_z >= bin_dim_z) continue;
521 
522           int other_bin = bin_dim_z * (bin_dim_y * obin_x + obin_y) + obin_z;
523 
524           if(other_bin == bin) continue;
525 
526           int obin_c = bin_count[other_bin];
527 
528           for(int otherActOffset = 0; otherActOffset < obin_c; otherActOffset += blockDim.x) {
529             int otherActIdx = otherActOffset + threadIdx.x;
530 
531             if(threadIdx.x < MIN(blockDim.x, obin_c - otherActOffset)) {
532               other_id[threadIdx.x] = binned_id[__mul24(other_bin, bin_nmax) + otherActIdx];
533               my_x = binned_x + __mul24(__mul24(other_bin, 3), bin_nmax) + otherActIdx;
534               other_x[threadIdx.x] = *my_x;
535               my_x += bin_nmax;
536               other_x[threadIdx.x + blockDim.x] = *my_x;
537               my_x += bin_nmax;
538               other_x[threadIdx.x + 2 * blockDim.x] = *my_x;
539             }
540 
541             __syncthreads();
542 
543             for(int k = 0; k < MIN(blockDim.x, obin_c - otherActOffset); ++k) {
544               if(i < _nlocal) {
545                 int j = other_id[k];
546 
547                 if(globcutoff < 0) {
548                   int jtype = _type[j];
549                   cut = _cutneighsq[itype * _cuda_ntypes + jtype];
550                 }
551 
552                 CUDA_FLOAT delx = x_i - other_x[k];
553                 CUDA_FLOAT dely = y_i - other_x[k + blockDim.x];
554                 CUDA_FLOAT delz = z_i - other_x[k + 2 * blockDim.x];
555                 CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;
556 
557                 if(rsq <= cut && i != j) {
558                   if((j >= _nlocal) && (i_border < 0))
559                     i_border = atomicAdd(_inum_border, 1);
560 
561                   if(jnum < _maxneighbors) {
562                     if(block_style) {
563                       _neighbors[i * _maxneighbors + jnum] = j;
564 
565                       if(j >= _nlocal) {
566                         _neighbors_border[i_border * _maxneighbors + jnum_border] = j;
567                       } else {
568                         _neighbors_inner[i * _maxneighbors + jnum_inner] = j;
569                       }
570                     } else {
571                       _neighbors[i + jnum * _nlocal] = j;
572 
573                       if(j >= _nlocal) {
574                         _neighbors_border[i_border + jnum_border * _nlocal] = j;
575                       } else {
576                         _neighbors_inner[i + jnum_inner * _nlocal] = j;
577                       }
578                     }
579                   }
580 
581                   ++jnum;
582 
583                   if(j >= _nlocal)
584                     jnum_border++;
585                   else
586                     jnum_inner++;
587                 }
588               }
589             }
590 
591             __syncthreads();
592           }
593         }
594 
595     if(jnum > _maxneighbors)((int*)_buffer)[0] = -jnum;
596 
597     if(i < _nlocal) {
598       _numneigh[i] = jnum;
599       _numneigh_inner[i] = jnum_inner;
600 
601       if(i_border >= 0) _numneigh_border[i_border] = jnum_border;
602 
603       if(i_border >= 0) _ilist_border[i_border] = i;
604 
605     }
606   }
607 }
608 
NeighborBuildFullNsq_Kernel()609 __global__ void NeighborBuildFullNsq_Kernel()
610 {
611   int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
612   int* buffer = (int*) _buffer;
613 
614   if(i < _nlocal) {
615     X_FLOAT* my_x = _x + i;
616     CUDA_FLOAT x_i = *my_x;
617     my_x += _nmax;
618     CUDA_FLOAT y_i = *my_x;
619     my_x += _nmax;
620     CUDA_FLOAT z_i = *my_x;
621     int jnum = 0;
622     int* jlist = _firstneigh[i];
623     _ilist[i] = i;
624 
625     int itype = _type[i];
626     __syncthreads();
627 
628     for(int j = 0; j < _nall; ++j) {
629       my_x = _x + j;
630       CUDA_FLOAT x_j = *my_x;
631       my_x += _nmax;
632       CUDA_FLOAT y_j = *my_x;
633       my_x += _nmax;
634       CUDA_FLOAT z_j = *my_x;
635       CUDA_FLOAT delx = x_i - x_j;
636       CUDA_FLOAT dely = y_i - y_j;
637       CUDA_FLOAT delz = z_i - z_j;
638       CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;
639       int jtype = _type[j];
640 
641       if(rsq <= _cutneighsq[itype * _cuda_ntypes + jtype] && i != j) {
642         if(jnum < _maxneighbors)
643           jlist[jnum] = j;
644 
645         if(i == 151)((int*)_buffer)[jnum + 2] = j;
646 
647         ++jnum;
648       }
649 
650       __syncthreads();
651     }
652 
653     if(jnum > _maxneighbors) buffer[0] = 0;
654 
655     _numneigh[i] = jnum;
656 
657     if(i == 151)((int*)_buffer)[1] = jnum;
658   }
659 }
660 
661