1 // clang-format off
2 /* ----------------------------------------------------------------------
3 LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
4 https://www.lammps.org/, Sandia National Laboratories
5 Steve Plimpton, sjplimp@sandia.gov
6
7 Copyright (2003) Sandia Corporation. Under the terms of Contract
8 DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
9 certain rights in this software. This software is distributed under
10 the GNU General Public License.
11
12 See the README file in the top-level LAMMPS directory.
13 ------------------------------------------------------------------------- */
14
15 /* ----------------------------------------------------------------------
16 Contributing authors: W. Michael Brown (Intel)
17 ------------------------------------------------------------------------- */
18
19 #include "npair_full_bin_ghost_intel.h"
20
21 #include "atom.h"
22 #include "comm.h"
23 #include "domain.h"
24 #include "error.h"
25 #include "neigh_list.h"
26 #include "neighbor.h"
27
28 #include <cmath>
29
30 using namespace LAMMPS_NS;
31
32 /* ---------------------------------------------------------------------- */
33
NPairFullBinGhostIntel(LAMMPS * lmp)34 NPairFullBinGhostIntel::NPairFullBinGhostIntel(LAMMPS *lmp) : NPairIntel(lmp) {}
35
36 /* ----------------------------------------------------------------------
37 binned neighbor list construction for all neighbors
38 include neighbors of ghost atoms, but no "special neighbors" for ghosts
39 every neighbor pair appears in list of both atoms i and j
40 ------------------------------------------------------------------------- */
41
build(NeighList * list)42 void NPairFullBinGhostIntel::build(NeighList *list)
43 {
44 #ifdef _LMP_INTEL_OFFLOAD
45 if (_fix->offload_noghost())
46 error->all(FLERR,
47 "The 'ghost no' option cannot be used with this INTEL pair style.");
48 #endif
49
50 if (nstencil > INTEL_MAX_STENCIL_CHECK)
51 error->all(FLERR, "Too many neighbor bins for INTEL package.");
52
53 #ifdef _LMP_INTEL_OFFLOAD
54 if (exclude)
55 error->all(FLERR, "Exclusion lists not yet supported for Intel offload");
56 #endif
57
58 if (_fix->precision() == FixIntel::PREC_MODE_MIXED)
59 fbi(list, _fix->get_mixed_buffers());
60 else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE)
61 fbi(list, _fix->get_double_buffers());
62 else
63 fbi(list, _fix->get_single_buffers());
64
65 _fix->stop_watch(TIME_HOST_NEIGHBOR);
66 }
67
68 /* ---------------------------------------------------------------------- */
69
70 template<class flt_t, class acc_t>
fbi(NeighList * list,IntelBuffers<flt_t,acc_t> * buffers)71 void NPairFullBinGhostIntel::fbi(NeighList * list,
72 IntelBuffers<flt_t,acc_t> * buffers)
73 {
74 const int nlocal = atom->nlocal;
75 const int nall = atom->nlocal + atom->nghost;
76 list->inum = atom->nlocal;
77 list->gnum = atom->nghost;
78
79 int host_start = _fix->host_start_neighbor();
80 const int off_end = _fix->offload_end_neighbor();
81
82 #ifdef _LMP_INTEL_OFFLOAD
83 if (off_end) grow_stencil();
84 if (_fix->full_host_list()) host_start = 0;
85 int offload_noghost = _fix->offload_noghost();
86 #endif
87
88 // only uses offload_end_neighbor to check whether we are doing offloading
89 // at all, no need to correct this later
90 buffers->grow_list(list, nall, comm->nthreads, 0, off_end,
91 _fix->nbor_pack_width());
92
93 int need_ic = 0;
94 if (atom->molecular != Atom::ATOMIC)
95 dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
96 neighbor->cutneighmax);
97
98 if (need_ic) {
99 fbi<flt_t,acc_t,1>(1, list, buffers, 0, off_end);
100 fbi<flt_t,acc_t,1>(0, list, buffers, host_start, nlocal);
101 } else {
102 fbi<flt_t,acc_t,0>(1, list, buffers, 0, off_end);
103 fbi<flt_t,acc_t,0>(0, list, buffers, host_start, nlocal);
104 }
105 }
106
107 /* ---------------------------------------------------------------------- */
108
109 template<class flt_t, class acc_t, int need_ic>
fbi(const int offload,NeighList * list,IntelBuffers<flt_t,acc_t> * buffers,const int pstart,const int pend)110 void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
111 IntelBuffers<flt_t,acc_t> * buffers,
112 const int pstart, const int pend) {
113 if (pend-pstart == 0) return;
114
115 const int nall = atom->nlocal + atom->nghost;
116 int nall_t = nall;
117 const int aend = nall;
118
119 const ATOM_T * _noalias const x = buffers->get_x();
120 int * _noalias const intel_list = buffers->intel_list(list);
121 int ** _noalias const firstneigh = list->firstneigh;
122 const int e_nall = nall_t;
123
124 const int molecular = atom->molecular;
125 int *ns = nullptr;
126 tagint *s = nullptr;
127 int tag_size = 0, special_size;
128 if (buffers->need_tag()) tag_size = e_nall;
129 if (molecular != Atom::ATOMIC) {
130 s = atom->special[0];
131 ns = atom->nspecial[0];
132 special_size = aend;
133 } else {
134 s = &buffers->_special_holder;
135 ns = &buffers->_nspecial_holder;
136 special_size = 0;
137 }
138 const tagint * _noalias const special = s;
139 const int * _noalias const nspecial = ns;
140 const int maxspecial = atom->maxspecial;
141 const tagint * _noalias const tag = atom->tag;
142
143 int * _noalias const ilist = list->ilist;
144 int * _noalias numneigh = list->numneigh;
145 const int nstencil = this->nstencil;
146 const int * _noalias const stencil = this->stencil;
147 const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
148 const flt_t * _noalias const cutneighghostsq =
149 buffers->get_cutneighghostsq()[0];
150 const int ntypes = atom->ntypes + 1;
151 const int nlocal = atom->nlocal;
152
153 #ifndef _LMP_INTEL_OFFLOAD
154 int * _noalias const mask = atom->mask;
155 tagint * _noalias const molecule = atom->molecule;
156 #endif
157
158 int moltemplate;
159 if (molecular == Atom::TEMPLATE) moltemplate = 1;
160 else moltemplate = 0;
161 if (moltemplate)
162 error->all(FLERR,
163 "Can't use moltemplate with npair style full/bin/ghost/intel.");
164
165 int tnum;
166 int * _noalias overflow;
167 #ifdef _LMP_INTEL_OFFLOAD
168 double *timer_compute;
169 if (offload) {
170 timer_compute = _fix->off_watch_neighbor();
171 tnum = buffers->get_off_threads();
172 overflow = _fix->get_off_overflow_flag();
173 _fix->stop_watch(TIME_HOST_NEIGHBOR);
174 _fix->start_watch(TIME_OFFLOAD_LATENCY);
175 } else
176 #endif
177 {
178 tnum = comm->nthreads;
179 overflow = _fix->get_overflow_flag();
180 }
181 const int nthreads = tnum;
182 const int maxnbors = buffers->get_max_nbors();
183 int * _noalias const atombin = buffers->get_atombin();
184 const int * _noalias const binpacked = buffers->get_binpacked();
185
186 const int xperiodic = domain->xperiodic;
187 const int yperiodic = domain->yperiodic;
188 const int zperiodic = domain->zperiodic;
189 const flt_t xprd_half = domain->xprd_half;
190 const flt_t yprd_half = domain->yprd_half;
191 const flt_t zprd_half = domain->zprd_half;
192
193 flt_t * _noalias const ncachex = buffers->get_ncachex();
194 flt_t * _noalias const ncachey = buffers->get_ncachey();
195 flt_t * _noalias const ncachez = buffers->get_ncachez();
196 int * _noalias const ncachej = buffers->get_ncachej();
197 int * _noalias const ncachejtype = buffers->get_ncachejtype();
198 tagint * _noalias const ncachetag = buffers->get_ncachetag();
199 const int ncache_stride = buffers->ncache_stride();
200
201 const int mbinx = this->mbinx;
202 const int mbiny = this->mbiny;
203 const int mbinz = this->mbinz;
204 const int * _noalias const stencilxyz = &this->stencilxyz[0][0];
205
206 int sb = 1;
207 if (special_flag[1] == 0) {
208 sb = 2;
209 if (special_flag[2] == 0) {
210 sb = 3;
211 if (special_flag[3] == 0)
212 sb = 4;
213 }
214 }
215 const int special_bound = sb;
216
217 #ifdef _LMP_INTEL_OFFLOAD
218 const int * _noalias const binhead = this->binhead;
219 const int * _noalias const bins = this->bins;
220 const int cop = _fix->coprocessor_number();
221 const int separate_buffers = _fix->separate_buffers();
222 #pragma offload target(mic:cop) if (offload) \
223 in(x:length(e_nall+1) alloc_if(0) free_if(0)) \
224 in(tag:length(tag_size) alloc_if(0) free_if(0)) \
225 in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
226 in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
227 in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \
228 in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \
229 in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
230 in(cutneighghostsq:length(0) alloc_if(0) free_if(0)) \
231 in(firstneigh:length(0) alloc_if(0) free_if(0)) \
232 in(intel_list:length(0) alloc_if(0) free_if(0)) \
233 in(numneigh:length(0) alloc_if(0) free_if(0)) \
234 in(ilist:length(0) alloc_if(0) free_if(0)) \
235 in(atombin:length(aend) alloc_if(0) free_if(0)) \
236 in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
237 in(ncachex,ncachey,ncachez,ncachej:length(0) alloc_if(0) free_if(0)) \
238 in(ncachejtype,ncachetag:length(0) alloc_if(0) free_if(0)) \
239 in(ncache_stride,maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \
240 in(separate_buffers,aend,nlocal,molecular,ntypes,mbinx,mbiny,special_bound)\
241 in(mbinz,xperiodic,yperiodic,zperiodic,xprd_half,yprd_half,zprd_half) \
242 in(stencilxyz:length(3*nstencil)) \
243 out(overflow:length(5) alloc_if(0) free_if(0)) \
244 out(timer_compute:length(1) alloc_if(0) free_if(0)) \
245 signal(tag)
246 #endif
247 {
248 #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
249 *timer_compute = MIC_Wtime();
250 #endif
251
252 #ifdef _LMP_INTEL_OFFLOAD
253 overflow[LMP_LOCAL_MIN] = 0;
254 overflow[LMP_LOCAL_MAX] = aend - 1;
255 overflow[LMP_GHOST_MIN] = e_nall;
256 overflow[LMP_GHOST_MAX] = -1;
257 #endif
258
259 int nstencilp = 0;
260 int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL];
261 for (int k = 0; k < nstencil; k++) {
262 binstart[nstencilp] = stencil[k];
263 int end = stencil[k] + 1;
264 for (int kk = k + 1; kk < nstencil; kk++) {
265 if (stencil[kk-1]+1 == stencil[kk]) {
266 end++;
267 k++;
268 } else break;
269 }
270 binend[nstencilp] = end;
271 nstencilp++;
272 }
273
274 const int mbinyx = mbiny * mbinx;
275
276 #if defined(_OPENMP)
277 #pragma omp parallel
278 #endif
279 {
280 const int num = aend;
281 int tid, ifrom, ito;
282
283 const double balance_factor = 2.0;
284 const double ibalance_factor = 1.0 / balance_factor;
285 const int gnum = num - nlocal;
286 const int wlocal = static_cast<int>(ceil(balance_factor * nlocal));
287 const int snum = wlocal + gnum;
288 IP_PRE_omp_range_id(ifrom, ito, tid, snum, nthreads);
289 if (ifrom < wlocal) ifrom = static_cast<int>(ibalance_factor * ifrom);
290 else ifrom -= wlocal - nlocal;
291 if (ito < wlocal) ito = static_cast<int>(ibalance_factor * ito);
292 else ito -= wlocal - nlocal;
293
294 int e_ito = ito;
295 const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
296
297 int pack_offset = maxnbors;
298 int ct = (ifrom + tid * 2) * maxnbors;
299 int * _noalias neighptr = intel_list + ct;
300 const int obound = pack_offset + maxnbors * 2;
301
302 const int toffs = tid * ncache_stride;
303 flt_t * _noalias const tx = ncachex + toffs;
304 flt_t * _noalias const ty = ncachey + toffs;
305 flt_t * _noalias const tz = ncachez + toffs;
306 int * _noalias const tj = ncachej + toffs;
307 int * _noalias const tjtype = ncachejtype + toffs;
308 tagint * _noalias const ttag = ncachetag + toffs;
309
310 // loop over all atoms in other bins in stencil, store every pair
311 int ncount, oldbin = -9999999;
312 for (int i = ifrom; i < ito; i++) {
313 const flt_t xtmp = x[i].x;
314 const flt_t ytmp = x[i].y;
315 const flt_t ztmp = x[i].z;
316 const int itype = x[i].w;
317 const tagint itag = tag[i];
318 const int ioffset = ntypes * itype;
319
320 const int ibin = atombin[i];
321 if (ibin != oldbin) {
322 oldbin = ibin;
323 ncount = 0;
324 if (i < nlocal) {
325 for (int k = 0; k < nstencilp; k++) {
326 const int bstart = binhead[ibin + binstart[k]];
327 const int bend = binhead[ibin + binend[k]];
328 #if defined(LMP_SIMD_COMPILER)
329 #if defined(USE_OMP_SIMD)
330 #pragma omp simd
331 #else
332 #pragma simd
333 #endif
334 #endif
335 for (int jj = bstart; jj < bend; jj++)
336 tj[ncount++] = binpacked[jj];
337 }
338 } else {
339 const int zbin = ibin / mbinyx;
340 const int zrem = ibin % mbinyx;
341 const int ybin = zrem / mbinx;
342 const int xbin = zrem % mbinx;
343 for (int k = 0; k < nstencil; k++) {
344 const int xbin2 = xbin + stencilxyz[3 * k + 0];
345 const int ybin2 = ybin + stencilxyz[3 * k + 1];
346 const int zbin2 = zbin + stencilxyz[3 * k + 2];
347 if (xbin2 < 0 || xbin2 >= mbinx ||
348 ybin2 < 0 || ybin2 >= mbiny ||
349 zbin2 < 0 || zbin2 >= mbinz) continue;
350
351 const int bstart = binhead[ibin + stencil[k]];
352 const int bend = binhead[ibin + stencil[k] + 1];
353 #if defined(LMP_SIMD_COMPILER)
354 #if defined(USE_OMP_SIMD)
355 #pragma omp simd
356 #else
357 #pragma simd
358 #endif
359 #endif
360 for (int jj = bstart; jj < bend; jj++)
361 tj[ncount++] = binpacked[jj];
362 }
363 } // if i < nlocal
364 #if defined(LMP_SIMD_COMPILER)
365 #if defined(USE_OMP_SIMD)
366 #pragma omp simd
367 #else
368 #pragma simd
369 #endif
370 #pragma vector aligned
371 #endif
372 for (int u = 0; u < ncount; u++) {
373 const int j = tj[u];
374 tx[u] = x[j].x;
375 ty[u] = x[j].y;
376 tz[u] = x[j].z;
377 tjtype[u] = x[j].w;
378 ttag[u] = tag[j];
379 }
380 } // if ibin != oldbin
381
382 // ---------------------- Loop over other bins
383
384 int n = maxnbors;
385 int n2 = n * 2;
386 int * _noalias neighptr2 = neighptr;
387 const flt_t * _noalias cutsq;
388 if (i < nlocal) cutsq = cutneighsq;
389 else cutsq = cutneighghostsq;
390
391 const int icp = i;
392
393 #if defined(LMP_SIMD_COMPILER)
394 #pragma vector aligned
395 #pragma ivdep
396 #endif
397 for (int u = 0; u < ncount; u++) {
398 int addme = 1;
399 int j = tj[u];
400
401 if (i == j) addme = 0;
402
403 // Cutoff Check
404 const flt_t delx = xtmp - tx[u];
405 const flt_t dely = ytmp - ty[u];
406 const flt_t delz = ztmp - tz[u];
407 const int jtype = tjtype[u];
408 const tagint jtag = ttag[u];
409 const flt_t rsq = delx * delx + dely * dely + delz * delz;
410 if (rsq > cutsq[ioffset + jtype]) addme = 0;
411
412 if (need_ic && icp < nlocal) {
413 int no_special;
414 ominimum_image_check(no_special, delx, dely, delz);
415 if (no_special)
416 j = -j - 1;
417 }
418
419 int flist = 0;
420 if (itag > jtag) {
421 if (((itag+jtag) & 1) == 0) flist = 1;
422 } else if (itag < jtag) {
423 if (((itag+jtag) & 1) == 1) flist = 1;
424 } else {
425 if (tz[u] < ztmp) flist = 1;
426 else if (tz[u] == ztmp && ty[u] < ytmp) flist = 1;
427 else if (tz[u] == ztmp && ty[u] == ytmp && tx[u] < xtmp)
428 flist = 1;
429 }
430 if (addme) {
431 if (flist)
432 neighptr2[n2++] = j;
433 else
434 neighptr[n++] = j;
435 }
436 } // for u
437
438 if ((molecular != Atom::ATOMIC) && (i < nlocal)) {
439 int alln = n;
440 n = 0;
441 #if defined(LMP_SIMD_COMPILER)
442 #ifdef LMP_INTEL_NBOR_COMPAT
443 #pragma ivdep
444 #else
445 #if defined(USE_OMP_SIMD)
446 #pragma omp simd
447 #else
448 #pragma simd
449 #endif
450 #endif
451 #pragma vector aligned
452 #endif
453 for (int u = 0; u < alln; u++) {
454 int which;
455 int addme = 1;
456 int j = neighptr[u];
457 if (need_ic && j < 0) {
458 which = 0;
459 j = -j - 1;
460 } else
461 ofind_special(which, special, nspecial, i, tag[j]);
462 if (which) {
463 j = j ^ (which << SBBITS);
464 if (which < special_bound) addme = 0;
465 }
466 #ifdef LMP_INTEL_NBOR_COMPAT
467 if (addme) neighptr2[n++] = j;
468 #else
469 neighptr2[n++] = j;
470 #endif
471 }
472 alln = n2;
473 n2 = maxnbors * 2;
474 #if defined(LMP_SIMD_COMPILER)
475 #ifdef LMP_INTEL_NBOR_COMPAT
476 #pragma ivdep
477 #else
478 #if defined(USE_OMP_SIMD)
479 #pragma omp simd
480 #else
481 #pragma simd
482 #endif
483 #endif
484 #pragma vector aligned
485 #endif
486 for (int u = n2; u < alln; u++) {
487 int which;
488 int addme = 1;
489 int j = neighptr[u];
490 if (need_ic && j < 0) {
491 which = 0;
492 j = -j - 1;
493 } else
494 ofind_special(which, special, nspecial, i, tag[j]);
495 if (which) {
496 j = j ^ (which << SBBITS);
497 if (which < special_bound) addme = 0;
498 }
499 #ifdef LMP_INTEL_NBOR_COMPAT
500 if (addme) neighptr2[n2++] = j;
501 #else
502 neighptr2[n2++] = j;
503 #endif
504 }
505 }
506
507 #ifndef _LMP_INTEL_OFFLOAD
508 if (exclude) {
509 int alln = n;
510 n = maxnbors;
511 #if defined(LMP_SIMD_COMPILER)
512 #pragma vector aligned
513 #pragma ivdep
514 #endif
515 for (int u = n; u < alln; u++) {
516 int addme = 1;
517 const int js = neighptr[u];
518 const int j = js & NEIGHMASK;
519 const int jtype = x[j].w;
520 if (exclusion(i,j,itype,jtype,mask,molecule)) addme = 0;
521 if (addme) neighptr2[n++] = js;
522 }
523 alln = n2;
524 n2 = maxnbors * 2;
525 #if defined(LMP_SIMD_COMPILER)
526 #pragma vector aligned
527 #pragma ivdep
528 #endif
529 for (int u = n2; u < alln; u++) {
530 int addme = 1;
531 const int js = neighptr[u];
532 const int j = js & NEIGHMASK;
533 const int jtype = x[j].w;
534 if (exclusion(i,j,itype,jtype,mask,molecule)) addme = 0;
535 if (addme) neighptr2[n2++] = js;
536 }
537 }
538 #endif
539
540 int ns = n - maxnbors;
541 int alln = n;
542 atombin[i] = ns;
543 n = 0;
544 for (int u = maxnbors; u < alln; u++)
545 neighptr[n++] = neighptr2[u];
546 ns += n2 - maxnbors * 2;
547 for (int u = maxnbors * 2; u < n2; u++)
548 neighptr[n++] = neighptr2[u];
549 if (ns > maxnbors) *overflow = 1;
550
551 ilist[i] = i;
552 firstneigh[i] = intel_list + ct;
553 numneigh[i] = ns;
554
555 ct += ns;
556 IP_PRE_edge_align(ct, sizeof(int));
557 neighptr = intel_list + ct;
558 if (ct + obound > list_size) {
559 if (i < ito - 1) {
560 *overflow = 1;
561 ct = (ifrom + tid * 2) * maxnbors;
562 }
563 }
564 }
565
566 if (*overflow == 1)
567 for (int i = ifrom; i < ito; i++)
568 numneigh[i] = 0;
569
570 #ifdef _LMP_INTEL_OFFLOAD
571 if (separate_buffers) {
572 overflow[LMP_LOCAL_MIN] = 0;
573 overflow[LMP_LOCAL_MAX] = nlocal - 1;
574 overflow[LMP_GHOST_MIN] = nlocal;
575 overflow[LMP_GHOST_MAX] = e_nall - 1;
576 }
577 #endif
578 } // end omp
579 #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
580 *timer_compute = MIC_Wtime() - *timer_compute;
581 #endif
582 } // end offload
583
584 #ifdef _LMP_INTEL_OFFLOAD
585 if (offload) {
586 _fix->stop_watch(TIME_OFFLOAD_LATENCY);
587 _fix->start_watch(TIME_HOST_NEIGHBOR);
588 firstneigh[0] = intel_list;
589 for (int n = 0; n < aend; n++) {
590 ilist[n] = n;
591 numneigh[n] = 0;
592 }
593 } else {
594 if (separate_buffers) {
595 _fix->start_watch(TIME_PACK);
596 _fix->set_neighbor_host_sizes();
597 buffers->pack_sep_from_single(_fix->host_min_local(),
598 _fix->host_used_local(),
599 _fix->host_min_ghost(),
600 _fix->host_used_ghost());
601 _fix->stop_watch(TIME_PACK);
602 }
603 }
604 #endif
605 }
606