1 /* ----------------------------------------------------------------------
2 LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
3
4 Original Version:
5 http://lammps.sandia.gov, Sandia National Laboratories
6 Steve Plimpton, sjplimp@sandia.gov
7
8 See the README file in the top-level LAMMPS directory.
9
10 -----------------------------------------------------------------------
11
12 USER-CUDA Package and associated modifications:
13 https://sourceforge.net/projects/lammpscuda/
14
15 Christian Trott, christian.trott@tu-ilmenau.de
16 Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
17 Theoretical Physics II, University of Technology Ilmenau, Germany
18
19 See the README file in the USER-CUDA directory.
20
21 This software is distributed under the GNU General Public License.
22 ------------------------------------------------------------------------- */
23
24 #define SBBITS 30
25
Binning_Kernel(int * binned_id,int bin_nmax,int bin_dim_x,int bin_dim_y,int bin_dim_z,CUDA_FLOAT rez_bin_size_x,CUDA_FLOAT rez_bin_size_y,CUDA_FLOAT rez_bin_size_z)26 __global__ void Binning_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, int bin_dim_z,
27 CUDA_FLOAT rez_bin_size_x, CUDA_FLOAT rez_bin_size_y, CUDA_FLOAT rez_bin_size_z)
28 {
29 int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
30
31 /*int* bin_count=(int*) _buffer;
32 bin_count=bin_count+20;
33 CUDA_FLOAT* binned_x=(CUDA_FLOAT*)(bin_count+bin_dim_x*bin_dim_y*bin_dim_z);*/
34 CUDA_FLOAT* binned_x = (CUDA_FLOAT*) _buffer;
35 binned_x = &binned_x[2];
36 int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax];
37
38 if(i < _nall) {
39 // copy atom position from global device memory to local register
40 // in this 3 steps to get as much coalesced access as possible
41 X_FLOAT* my_x = _x + i;
42 CUDA_FLOAT x_i = *my_x;
43 my_x += _nmax;
44 CUDA_FLOAT y_i = *my_x;
45 my_x += _nmax;
46 CUDA_FLOAT z_i = *my_x;
47
48
49 // calculate flat bin index
50 int bx = __float2int_rd(rez_bin_size_x * (x_i - _sublo[0])) + 2;
51 int by = __float2int_rd(rez_bin_size_y * (y_i - _sublo[1])) + 2;
52 int bz = __float2int_rd(rez_bin_size_z * (z_i - _sublo[2])) + 2;
53
54 bx -= bx * negativCUDA(1.0f * bx);
55 bx -= (bx - bin_dim_x + 1) * negativCUDA(1.0f * bin_dim_x - 1.0f - 1.0f * bx);
56 by -= by * negativCUDA(1.0f * by);
57 by -= (by - bin_dim_y + 1) * negativCUDA(1.0f * bin_dim_y - 1.0f - 1.0f * by);
58 bz -= bz * negativCUDA(1.0f * bz);
59 bz -= (bz - bin_dim_z + 1) * negativCUDA(1.0f * bin_dim_z - 1.0f - 1.0f * bz);
60
61
62 const unsigned j = bin_dim_z * (bin_dim_y * bx + by) + bz;
63
64 // add new atom to bin, get bin-array position
65 const unsigned k = atomicAdd(& bin_count[j], 1);
66
67 if(k < bin_nmax) {
68 binned_id [bin_nmax * j + k] = i;
69 binned_x [3 * bin_nmax * j + k] = x_i;
70 binned_x [3 * bin_nmax * j + k + bin_nmax] = y_i;
71 binned_x [3 * bin_nmax * j + k + 2 * bin_nmax] = z_i;
72 } else {
73 // normally, this should not happen:
74 int errorn = atomicAdd((int*) _buffer, 1);
75 MYEMUDBG(printf("# CUDA: Binning_Kernel: WARNING: atom %i ignored, no place left in bin %u\n", i, j);)
76 }
77 }
78 }
79
80
exclusion(int & i,int & j,int & itype,int & jtype)81 __device__ inline int exclusion(int &i, int &j, int &itype, int &jtype)
82 {
83 int m;
84
85 if(_nex_type)
86 if(_ex_type[itype * _cuda_ntypes + jtype]) return 1;
87
88 if(_nex_group) {
89 for(m = 0; m < _nex_group; m++) {
90 if(_mask[i] & _ex1_bit[m] && _mask[j] & _ex2_bit[m]) return 1;
91
92 if(_mask[i] & _ex2_bit[m] && _mask[j] & _ex1_bit[m]) return 1;
93 }
94 }
95
96 if(_nex_mol) {
97 if(_molecule[i] == _molecule[j])
98 for(m = 0; m < _nex_mol; m++)
99 if(_mask[i] & _ex_mol_bit[m] && _mask[j] & _ex_mol_bit[m]) return 1;
100 }
101
102 return 0;
103 }
104
105 extern __shared__ CUDA_FLOAT shared[];
106
find_special(int3 & n,int * list,int & tag,int3 flag)107 __device__ inline int find_special(int3 &n, int* list, int &tag, int3 flag)
108 {
109 int k = n.z;
110
111 for(int l = 0; l < n.z; l++) k = ((list[l] == tag) ? l : k);
112
113 return k < n.x ? flag.x : (k < n.y ? flag.y : (k < n.z ? flag.z : 0));
114 }
115
116 template <const unsigned int exclude>
NeighborBuildFullBin_Kernel(int * binned_id,int bin_nmax,int bin_dim_x,int bin_dim_y,CUDA_FLOAT globcutoff,int block_style,bool neighall)117 __global__ void NeighborBuildFullBin_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, CUDA_FLOAT globcutoff, int block_style, bool neighall)
118 {
119 int natoms = neighall ? _nall : _nlocal;
120 //const bool domol=false;
121 int bin_dim_z = gridDim.y;
122 CUDA_FLOAT* binned_x = (CUDA_FLOAT*) _buffer;
123 binned_x = &binned_x[2];
124 int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax];
125 int bin = __mul24(gridDim.y, blockIdx.x) + blockIdx.y;
126 int bin_x = blockIdx.x / bin_dim_y;
127 int bin_y = blockIdx.x - bin_x * bin_dim_y;
128 int bin_z = blockIdx.y;
129 int bin_c = bin_count[bin];
130
131
132 CUDA_FLOAT cut;
133
134 if(globcutoff > 0)
135 cut = globcutoff;
136
137 int i = _nall;
138 CUDA_FLOAT* my_x;
139 CUDA_FLOAT x_i, y_i, z_i;
140
141 for(int actOffset = 0; actOffset < bin_c; actOffset += blockDim.x) {
142
143 int actIdx = threadIdx.x + actOffset;
144 CUDA_FLOAT* other_x = shared;
145 int* other_id = (int*) &other_x[3 * blockDim.x];
146
147 if(actIdx < bin_c) {
148 i = binned_id[__mul24(bin, bin_nmax) + actIdx];
149 my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + actIdx;
150 x_i = *my_x;
151 my_x += bin_nmax;
152 y_i = *my_x;
153 my_x += bin_nmax;
154 z_i = *my_x;
155 } else
156 i = 2 * _nall;
157
158 __syncthreads();
159
160 int jnum = 0;
161 int itype;
162
163 if(i < natoms) {
164 jnum = 0;
165 _ilist[i] = i;
166 itype = _type[i];
167 }
168
169 //__syncthreads();
170
171
172 for(int otherActOffset = 0; otherActOffset < bin_c; otherActOffset += blockDim.x) {
173 int otherActIdx = threadIdx.x + otherActOffset;
174
175 if(otherActIdx < bin_c) {
176 if(otherActOffset == actOffset) {
177 other_id[threadIdx.x] = i;
178 other_x[threadIdx.x] = x_i;
179 other_x[threadIdx.x + blockDim.x] = y_i;
180 other_x[threadIdx.x + 2 * blockDim.x] = z_i;
181 } else {
182 other_id[threadIdx.x] = binned_id[__mul24(bin, bin_nmax) + otherActIdx];
183 my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + otherActIdx;
184 other_x[threadIdx.x] = *my_x;
185 my_x += bin_nmax;
186 other_x[threadIdx.x + blockDim.x] = *my_x;
187 my_x += bin_nmax;
188 other_x[threadIdx.x + __mul24(2, blockDim.x)] = *my_x;
189
190 }
191 }
192
193 __syncthreads();
194 int kk = threadIdx.x;
195
196 for(int k = 0; k < MIN(bin_c - otherActOffset, blockDim.x); ++k) {
197 if(i < natoms) {
198 kk++;
199 kk = kk < MIN(bin_c - otherActOffset, blockDim.x) ? kk : 0;
200 int j = other_id[kk];
201
202 if(exclude && exclusion(i, j, itype, _type[j])) continue;
203
204 if(globcutoff < 0) {
205 int jtype = _type[j];
206 cut = _cutneighsq[itype * _cuda_ntypes + jtype];
207 }
208
209 CUDA_FLOAT delx = x_i - other_x[kk];
210 CUDA_FLOAT dely = y_i - other_x[kk + blockDim.x];
211 CUDA_FLOAT delz = z_i - other_x[kk + 2 * blockDim.x];
212 CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;
213
214
215 if(rsq <= cut && i != j) {
216 if(jnum < _maxneighbors) {
217 if(block_style)
218 _neighbors[i * _maxneighbors + jnum] = j;
219 else
220 _neighbors[i + jnum * natoms] = j;
221 }
222
223 ++jnum;
224 }
225 }
226 }
227
228 __syncthreads();
229
230 }
231
232 for(int obin_x = bin_x - 1; obin_x < bin_x + 2; obin_x++)
233 for(int obin_y = bin_y - 1; obin_y < bin_y + 2; obin_y++)
234 for(int obin_z = bin_z - 1; obin_z < bin_z + 2; obin_z++) {
235 if(obin_x < 0 || obin_y < 0 || obin_z < 0) continue;
236
237 if(obin_x >= bin_dim_x || obin_y >= bin_dim_y || obin_z >= bin_dim_z) continue;
238
239 int other_bin = bin_dim_z * (bin_dim_y * obin_x + obin_y) + obin_z;
240
241 if(other_bin == bin) continue;
242
243 int obin_c = bin_count[other_bin];
244
245 for(int otherActOffset = 0; otherActOffset < obin_c; otherActOffset += blockDim.x) {
246 int otherActIdx = otherActOffset + threadIdx.x;
247
248 if(threadIdx.x < MIN(blockDim.x, obin_c - otherActOffset)) {
249 other_id[threadIdx.x] = binned_id[__mul24(other_bin, bin_nmax) + otherActIdx];
250 my_x = binned_x + __mul24(__mul24(other_bin, 3), bin_nmax) + otherActIdx;
251 other_x[threadIdx.x] = *my_x;
252 my_x += bin_nmax;
253 other_x[threadIdx.x + blockDim.x] = *my_x;
254 my_x += bin_nmax;
255 other_x[threadIdx.x + 2 * blockDim.x] = *my_x;
256 }
257
258 __syncthreads();
259
260 for(int k = 0; k < MIN(blockDim.x, obin_c - otherActOffset); ++k) {
261 if(i < natoms) {
262 int j = other_id[k];
263
264 if(exclude && exclusion(i, j, itype, _type[j])) continue;
265
266 if(globcutoff < 0) {
267 int jtype = _type[j];
268 cut = _cutneighsq[itype * _cuda_ntypes + jtype];
269 }
270
271 CUDA_FLOAT delx = x_i - other_x[k];
272 CUDA_FLOAT dely = y_i - other_x[k + blockDim.x];
273 CUDA_FLOAT delz = z_i - other_x[k + 2 * blockDim.x];
274 CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;
275
276 if(rsq <= cut && i != j) {
277 if(jnum < _maxneighbors) {
278 if(block_style)
279 _neighbors[i * _maxneighbors + jnum] = j;
280 else
281 _neighbors[i + jnum * natoms] = j;
282 }
283
284 ++jnum;
285 }
286 }
287 }
288
289 __syncthreads();
290
291 }
292 }
293
294 if(jnum > _maxneighbors)((int*)_buffer)[0] = -jnum;
295
296 if(i < natoms)
297 _numneigh[i] = jnum;
298 }
299 }
300
301
FindSpecial(int block_style)302 __global__ void FindSpecial(int block_style)
303 {
304 int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
305 int which;
306 int tag_mask = 0;
307 int3 spec_flag;
308
309 int3 mynspecial = {0, 0, 1};
310
311 if(ii >= _nlocal) return;
312
313 int special_id[CUDA_MAX_NSPECIAL];
314
315 int i = _ilist[ii];
316
317 if(i >= _nlocal) return;
318
319 int jnum = _numneigh[i];
320
321 if(_special_flag[1] == 0) spec_flag.x = -1;
322 else if(_special_flag[1] == 1) spec_flag.x = 0;
323 else spec_flag.x = 1;
324
325 if(_special_flag[2] == 0) spec_flag.y = -1;
326 else if(_special_flag[2] == 1) spec_flag.y = 0;
327 else spec_flag.y = 2;
328
329 if(_special_flag[3] == 0) spec_flag.z = -1;
330 else if(_special_flag[3] == 1) spec_flag.z = 0;
331 else spec_flag.z = 3;
332
333 mynspecial.x = _nspecial[i];
334 mynspecial.y = _nspecial[i + _nmax];
335 mynspecial.z = _nspecial[i + 2 * _nmax];
336
337 if(i < _nlocal) {
338 int* list = &_special[i];
339
340 for(int k = 0; k < mynspecial.z; k++) {
341 special_id[k] = list[k * _nmax];
342 tag_mask = tag_mask | special_id[k];
343 }
344 }
345
346
347 for(int k = 0; k < MIN(jnum, _maxneighbors); k++) {
348 int j;
349
350 if(block_style)
351 j = _neighbors[i * _maxneighbors + k];
352 else
353 j = _neighbors[i + k * _nlocal];
354
355 int tag_j = _tag[j];
356 which = 0;
357
358 if((tag_mask & tag_j) == tag_j) {
359 which = find_special(mynspecial, special_id, tag_j, spec_flag);
360
361 if(which > 0) {
362 if(block_style)
363 _neighbors[i * _maxneighbors + k] = j ^ (which << SBBITS);
364 else
365 _neighbors[i + k * _nlocal] = j ^ (which << SBBITS);
366 } else if(which < 0) {
367 if(block_style)
368 _neighbors[i * _maxneighbors + k] = _neighbors[i * _maxneighbors + jnum - 1];
369 else
370 _neighbors[i + k * _nlocal] = _neighbors[i + (jnum - 1) * _nlocal];
371
372 jnum--;
373 k--;
374 }
375 }
376 }
377
378 _numneigh[i] = jnum;
379 }
380
NeighborBuildFullBin_OverlapComm_Kernel(int * binned_id,int bin_nmax,int bin_dim_x,int bin_dim_y,CUDA_FLOAT globcutoff,int block_style)381 __global__ void NeighborBuildFullBin_OverlapComm_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, CUDA_FLOAT globcutoff, int block_style)
382 {
383 int bin_dim_z = gridDim.y;
384 CUDA_FLOAT* binned_x = (CUDA_FLOAT*) _buffer;
385 binned_x = &binned_x[2];
386 int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax];
387 int bin = __mul24(gridDim.y, blockIdx.x) + blockIdx.y;
388 int bin_x = blockIdx.x / bin_dim_y;
389 int bin_y = blockIdx.x - bin_x * bin_dim_y;
390 int bin_z = blockIdx.y;
391 int bin_c = bin_count[bin];
392
393
394 CUDA_FLOAT cut;
395
396 if(globcutoff > 0)
397 cut = globcutoff;
398
399 int i = _nall;
400 CUDA_FLOAT* my_x;
401 CUDA_FLOAT x_i, y_i, z_i;
402
403 for(int actOffset = 0; actOffset < bin_c; actOffset += blockDim.x) {
404
405 int actIdx = threadIdx.x + actOffset;
406 CUDA_FLOAT* other_x = shared;
407 int* other_id = (int*) &other_x[3 * blockDim.x];
408
409 if(actIdx < bin_c) {
410 i = binned_id[__mul24(bin, bin_nmax) + actIdx];
411 my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + actIdx;
412 x_i = *my_x;
413 my_x += bin_nmax;
414 y_i = *my_x;
415 my_x += bin_nmax;
416 z_i = *my_x;
417 } else
418 i = 2 * _nall;
419
420 __syncthreads();
421
422 int jnum = 0;
423 int jnum_border = 0;
424 int jnum_inner = 0;
425 int i_border = -1;
426 int itype;
427
428 if(i < _nlocal) {
429 jnum = 0;
430 _ilist[i] = i;
431 itype = _type[i];
432 }
433
434 __syncthreads();
435
436
437 for(int otherActOffset = 0; otherActOffset < bin_c; otherActOffset += blockDim.x) {
438 int otherActIdx = threadIdx.x + otherActOffset;
439
440 if(otherActIdx < bin_c) {
441 if(otherActOffset == actOffset) {
442 other_id[threadIdx.x] = i;
443 other_x[threadIdx.x] = x_i;
444 other_x[threadIdx.x + blockDim.x] = y_i;
445 other_x[threadIdx.x + 2 * blockDim.x] = z_i;
446 } else {
447 other_id[threadIdx.x] = binned_id[__mul24(bin, bin_nmax) + otherActIdx];
448 my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + otherActIdx;
449 other_x[threadIdx.x] = *my_x;
450 my_x += bin_nmax;
451 other_x[threadIdx.x + blockDim.x] = *my_x;
452 my_x += bin_nmax;
453 other_x[threadIdx.x + __mul24(2, blockDim.x)] = *my_x;
454
455 }
456 }
457
458 __syncthreads();
459 int kk = threadIdx.x;
460
461 for(int k = 0; k < MIN(bin_c - otherActOffset, blockDim.x); ++k) {
462 if(i < _nlocal) {
463 kk++;
464 kk = kk < MIN(bin_c - otherActOffset, blockDim.x) ? kk : 0;
465 int j = other_id[kk];
466
467 if(globcutoff < 0) {
468 int jtype = _type[j];
469 cut = _cutneighsq[itype * _cuda_ntypes + jtype];
470 }
471
472 CUDA_FLOAT delx = x_i - other_x[kk];
473 CUDA_FLOAT dely = y_i - other_x[kk + blockDim.x];
474 CUDA_FLOAT delz = z_i - other_x[kk + 2 * blockDim.x];
475 CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;
476
477
478 if(rsq <= cut && i != j) {
479 if((j >= _nlocal) && (i_border < 0))
480 i_border = atomicAdd(_inum_border, 1);
481
482 if(jnum < _maxneighbors) {
483 if(block_style) {
484 _neighbors[i * _maxneighbors + jnum] = j;
485
486 if(j >= _nlocal) {
487 _neighbors_border[i_border * _maxneighbors + jnum_border] = j;
488 } else {
489 _neighbors_inner[i * _maxneighbors + jnum_inner] = j;
490 }
491 } else {
492 _neighbors[i + jnum * _nlocal] = j;
493
494 if(j >= _nlocal) {
495 _neighbors_border[i_border + jnum_border * _nlocal] = j;
496 } else {
497 _neighbors_inner[i + jnum_inner * _nlocal] = j;
498 }
499 }
500 }
501
502 ++jnum;
503
504 if(j >= _nlocal)
505 jnum_border++;
506 else
507 jnum_inner++;
508 }
509 }
510 }
511
512 __syncthreads();
513 }
514
515 for(int obin_x = bin_x - 1; obin_x < bin_x + 2; obin_x++)
516 for(int obin_y = bin_y - 1; obin_y < bin_y + 2; obin_y++)
517 for(int obin_z = bin_z - 1; obin_z < bin_z + 2; obin_z++) {
518 if(obin_x < 0 || obin_y < 0 || obin_z < 0) continue;
519
520 if(obin_x >= bin_dim_x || obin_y >= bin_dim_y || obin_z >= bin_dim_z) continue;
521
522 int other_bin = bin_dim_z * (bin_dim_y * obin_x + obin_y) + obin_z;
523
524 if(other_bin == bin) continue;
525
526 int obin_c = bin_count[other_bin];
527
528 for(int otherActOffset = 0; otherActOffset < obin_c; otherActOffset += blockDim.x) {
529 int otherActIdx = otherActOffset + threadIdx.x;
530
531 if(threadIdx.x < MIN(blockDim.x, obin_c - otherActOffset)) {
532 other_id[threadIdx.x] = binned_id[__mul24(other_bin, bin_nmax) + otherActIdx];
533 my_x = binned_x + __mul24(__mul24(other_bin, 3), bin_nmax) + otherActIdx;
534 other_x[threadIdx.x] = *my_x;
535 my_x += bin_nmax;
536 other_x[threadIdx.x + blockDim.x] = *my_x;
537 my_x += bin_nmax;
538 other_x[threadIdx.x + 2 * blockDim.x] = *my_x;
539 }
540
541 __syncthreads();
542
543 for(int k = 0; k < MIN(blockDim.x, obin_c - otherActOffset); ++k) {
544 if(i < _nlocal) {
545 int j = other_id[k];
546
547 if(globcutoff < 0) {
548 int jtype = _type[j];
549 cut = _cutneighsq[itype * _cuda_ntypes + jtype];
550 }
551
552 CUDA_FLOAT delx = x_i - other_x[k];
553 CUDA_FLOAT dely = y_i - other_x[k + blockDim.x];
554 CUDA_FLOAT delz = z_i - other_x[k + 2 * blockDim.x];
555 CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;
556
557 if(rsq <= cut && i != j) {
558 if((j >= _nlocal) && (i_border < 0))
559 i_border = atomicAdd(_inum_border, 1);
560
561 if(jnum < _maxneighbors) {
562 if(block_style) {
563 _neighbors[i * _maxneighbors + jnum] = j;
564
565 if(j >= _nlocal) {
566 _neighbors_border[i_border * _maxneighbors + jnum_border] = j;
567 } else {
568 _neighbors_inner[i * _maxneighbors + jnum_inner] = j;
569 }
570 } else {
571 _neighbors[i + jnum * _nlocal] = j;
572
573 if(j >= _nlocal) {
574 _neighbors_border[i_border + jnum_border * _nlocal] = j;
575 } else {
576 _neighbors_inner[i + jnum_inner * _nlocal] = j;
577 }
578 }
579 }
580
581 ++jnum;
582
583 if(j >= _nlocal)
584 jnum_border++;
585 else
586 jnum_inner++;
587 }
588 }
589 }
590
591 __syncthreads();
592 }
593 }
594
595 if(jnum > _maxneighbors)((int*)_buffer)[0] = -jnum;
596
597 if(i < _nlocal) {
598 _numneigh[i] = jnum;
599 _numneigh_inner[i] = jnum_inner;
600
601 if(i_border >= 0) _numneigh_border[i_border] = jnum_border;
602
603 if(i_border >= 0) _ilist_border[i_border] = i;
604
605 }
606 }
607 }
608
NeighborBuildFullNsq_Kernel()609 __global__ void NeighborBuildFullNsq_Kernel()
610 {
611 int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
612 int* buffer = (int*) _buffer;
613
614 if(i < _nlocal) {
615 X_FLOAT* my_x = _x + i;
616 CUDA_FLOAT x_i = *my_x;
617 my_x += _nmax;
618 CUDA_FLOAT y_i = *my_x;
619 my_x += _nmax;
620 CUDA_FLOAT z_i = *my_x;
621 int jnum = 0;
622 int* jlist = _firstneigh[i];
623 _ilist[i] = i;
624
625 int itype = _type[i];
626 __syncthreads();
627
628 for(int j = 0; j < _nall; ++j) {
629 my_x = _x + j;
630 CUDA_FLOAT x_j = *my_x;
631 my_x += _nmax;
632 CUDA_FLOAT y_j = *my_x;
633 my_x += _nmax;
634 CUDA_FLOAT z_j = *my_x;
635 CUDA_FLOAT delx = x_i - x_j;
636 CUDA_FLOAT dely = y_i - y_j;
637 CUDA_FLOAT delz = z_i - z_j;
638 CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;
639 int jtype = _type[j];
640
641 if(rsq <= _cutneighsq[itype * _cuda_ntypes + jtype] && i != j) {
642 if(jnum < _maxneighbors)
643 jlist[jnum] = j;
644
645 if(i == 151)((int*)_buffer)[jnum + 2] = j;
646
647 ++jnum;
648 }
649
650 __syncthreads();
651 }
652
653 if(jnum > _maxneighbors) buffer[0] = 0;
654
655 _numneigh[i] = jnum;
656
657 if(i == 151)((int*)_buffer)[1] = jnum;
658 }
659 }
660
661