1 // clang-format off
2 /* ----------------------------------------------------------------------
3    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
4    https://www.lammps.org/, Sandia National Laboratories
5    Steve Plimpton, sjplimp@sandia.gov
6 
7    Copyright (2003) Sandia Corporation.  Under the terms of Contract
8    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
9    certain rights in this software.  This software is distributed under
10    the GNU General Public License.
11 
12    See the README file in the top-level LAMMPS directory.
13 ------------------------------------------------------------------------- */
14 
15 /* ----------------------------------------------------------------------
16    Contributing authors: W. Michael Brown (Intel)
17 ------------------------------------------------------------------------- */
18 
19 #include "npair_full_bin_ghost_intel.h"
20 
21 #include "atom.h"
22 #include "comm.h"
23 #include "domain.h"
24 #include "error.h"
25 #include "neigh_list.h"
26 #include "neighbor.h"
27 
28 #include <cmath>
29 
30 using namespace LAMMPS_NS;
31 
32 /* ---------------------------------------------------------------------- */
33 
NPairFullBinGhostIntel(LAMMPS * lmp)34 NPairFullBinGhostIntel::NPairFullBinGhostIntel(LAMMPS *lmp) : NPairIntel(lmp) {}
35 
36 /* ----------------------------------------------------------------------
37    binned neighbor list construction for all neighbors
38    include neighbors of ghost atoms, but no "special neighbors" for ghosts
39    every neighbor pair appears in list of both atoms i and j
40 ------------------------------------------------------------------------- */
41 
build(NeighList * list)42 void NPairFullBinGhostIntel::build(NeighList *list)
43 {
44   #ifdef _LMP_INTEL_OFFLOAD
45   if (_fix->offload_noghost())
46     error->all(FLERR,
47       "The 'ghost no' option cannot be used with this INTEL pair style.");
48   #endif
49 
50   if (nstencil > INTEL_MAX_STENCIL_CHECK)
51     error->all(FLERR, "Too many neighbor bins for INTEL package.");
52 
53   #ifdef _LMP_INTEL_OFFLOAD
54   if (exclude)
55     error->all(FLERR, "Exclusion lists not yet supported for Intel offload");
56   #endif
57 
58   if (_fix->precision() == FixIntel::PREC_MODE_MIXED)
59     fbi(list, _fix->get_mixed_buffers());
60   else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE)
61     fbi(list, _fix->get_double_buffers());
62   else
63     fbi(list, _fix->get_single_buffers());
64 
65   _fix->stop_watch(TIME_HOST_NEIGHBOR);
66 }
67 
68 /* ---------------------------------------------------------------------- */
69 
70 template<class flt_t, class acc_t>
fbi(NeighList * list,IntelBuffers<flt_t,acc_t> * buffers)71 void NPairFullBinGhostIntel::fbi(NeighList * list,
72                                  IntelBuffers<flt_t,acc_t> * buffers)
73 {
74   const int nlocal = atom->nlocal;
75   const int nall = atom->nlocal + atom->nghost;
76   list->inum = atom->nlocal;
77   list->gnum = atom->nghost;
78 
79   int host_start = _fix->host_start_neighbor();
80   const int off_end = _fix->offload_end_neighbor();
81 
82   #ifdef _LMP_INTEL_OFFLOAD
83   if (off_end) grow_stencil();
84   if (_fix->full_host_list()) host_start = 0;
85   int offload_noghost = _fix->offload_noghost();
86   #endif
87 
88   // only uses offload_end_neighbor to check whether we are doing offloading
89   // at all, no need to correct this later
90   buffers->grow_list(list, nall, comm->nthreads, 0, off_end,
91                      _fix->nbor_pack_width());
92 
93   int need_ic = 0;
94   if (atom->molecular != Atom::ATOMIC)
95     dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
96                          neighbor->cutneighmax);
97 
98   if (need_ic) {
99     fbi<flt_t,acc_t,1>(1, list, buffers, 0, off_end);
100     fbi<flt_t,acc_t,1>(0, list, buffers, host_start, nlocal);
101   } else {
102     fbi<flt_t,acc_t,0>(1, list, buffers, 0, off_end);
103     fbi<flt_t,acc_t,0>(0, list, buffers, host_start, nlocal);
104   }
105 }
106 
107 /* ---------------------------------------------------------------------- */
108 
109 template<class flt_t, class acc_t, int need_ic>
fbi(const int offload,NeighList * list,IntelBuffers<flt_t,acc_t> * buffers,const int pstart,const int pend)110 void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
111                                  IntelBuffers<flt_t,acc_t> * buffers,
112                                  const int pstart, const int pend) {
113   if (pend-pstart == 0) return;
114 
115   const int nall = atom->nlocal + atom->nghost;
116   int nall_t = nall;
117   const int aend = nall;
118 
119   const ATOM_T * _noalias const x = buffers->get_x();
120   int * _noalias const intel_list = buffers->intel_list(list);
121   int ** _noalias const firstneigh = list->firstneigh;
122   const int e_nall = nall_t;
123 
124   const int molecular = atom->molecular;
125   int *ns = nullptr;
126   tagint *s = nullptr;
127   int tag_size = 0, special_size;
128   if (buffers->need_tag()) tag_size = e_nall;
129   if (molecular != Atom::ATOMIC) {
130     s = atom->special[0];
131     ns = atom->nspecial[0];
132     special_size = aend;
133   } else {
134     s = &buffers->_special_holder;
135     ns = &buffers->_nspecial_holder;
136     special_size = 0;
137   }
138   const tagint * _noalias const special = s;
139   const int * _noalias const nspecial = ns;
140   const int maxspecial = atom->maxspecial;
141   const tagint * _noalias const tag = atom->tag;
142 
143   int * _noalias const ilist = list->ilist;
144   int * _noalias numneigh = list->numneigh;
145   const int nstencil = this->nstencil;
146   const int * _noalias const stencil = this->stencil;
147   const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
148   const flt_t * _noalias const cutneighghostsq =
149     buffers->get_cutneighghostsq()[0];
150   const int ntypes = atom->ntypes + 1;
151   const int nlocal = atom->nlocal;
152 
153   #ifndef _LMP_INTEL_OFFLOAD
154   int * _noalias const mask = atom->mask;
155   tagint * _noalias const molecule = atom->molecule;
156   #endif
157 
158   int moltemplate;
159   if (molecular == Atom::TEMPLATE) moltemplate = 1;
160   else moltemplate = 0;
161   if (moltemplate)
162     error->all(FLERR,
163                "Can't use moltemplate with npair style full/bin/ghost/intel.");
164 
165   int tnum;
166   int * _noalias overflow;
167   #ifdef _LMP_INTEL_OFFLOAD
168   double *timer_compute;
169   if (offload) {
170     timer_compute = _fix->off_watch_neighbor();
171     tnum = buffers->get_off_threads();
172     overflow = _fix->get_off_overflow_flag();
173     _fix->stop_watch(TIME_HOST_NEIGHBOR);
174     _fix->start_watch(TIME_OFFLOAD_LATENCY);
175   } else
176   #endif
177   {
178     tnum = comm->nthreads;
179     overflow = _fix->get_overflow_flag();
180   }
181   const int nthreads = tnum;
182   const int maxnbors = buffers->get_max_nbors();
183   int * _noalias const atombin = buffers->get_atombin();
184   const int * _noalias const binpacked = buffers->get_binpacked();
185 
186   const int xperiodic = domain->xperiodic;
187   const int yperiodic = domain->yperiodic;
188   const int zperiodic = domain->zperiodic;
189   const flt_t xprd_half = domain->xprd_half;
190   const flt_t yprd_half = domain->yprd_half;
191   const flt_t zprd_half = domain->zprd_half;
192 
193   flt_t * _noalias const ncachex = buffers->get_ncachex();
194   flt_t * _noalias const ncachey = buffers->get_ncachey();
195   flt_t * _noalias const ncachez = buffers->get_ncachez();
196   int * _noalias const ncachej = buffers->get_ncachej();
197   int * _noalias const ncachejtype = buffers->get_ncachejtype();
198   tagint * _noalias const ncachetag = buffers->get_ncachetag();
199   const int ncache_stride = buffers->ncache_stride();
200 
201   const int mbinx = this->mbinx;
202   const int mbiny = this->mbiny;
203   const int mbinz = this->mbinz;
204   const int * _noalias const stencilxyz = &this->stencilxyz[0][0];
205 
206   int sb = 1;
207   if (special_flag[1] == 0) {
208     sb = 2;
209     if (special_flag[2] == 0) {
210       sb = 3;
211       if (special_flag[3] == 0)
212         sb = 4;
213     }
214   }
215   const int special_bound = sb;
216 
217   #ifdef _LMP_INTEL_OFFLOAD
218   const int * _noalias const binhead = this->binhead;
219   const int * _noalias const bins = this->bins;
220   const int cop = _fix->coprocessor_number();
221   const int separate_buffers = _fix->separate_buffers();
222   #pragma offload target(mic:cop) if (offload) \
223     in(x:length(e_nall+1) alloc_if(0) free_if(0)) \
224     in(tag:length(tag_size) alloc_if(0) free_if(0)) \
225     in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
226     in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
227     in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \
228     in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \
229     in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
230     in(cutneighghostsq:length(0) alloc_if(0) free_if(0)) \
231     in(firstneigh:length(0) alloc_if(0) free_if(0)) \
232     in(intel_list:length(0) alloc_if(0) free_if(0)) \
233     in(numneigh:length(0) alloc_if(0) free_if(0)) \
234     in(ilist:length(0) alloc_if(0) free_if(0)) \
235     in(atombin:length(aend) alloc_if(0) free_if(0)) \
236     in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
237     in(ncachex,ncachey,ncachez,ncachej:length(0) alloc_if(0) free_if(0)) \
238     in(ncachejtype,ncachetag:length(0) alloc_if(0) free_if(0)) \
239     in(ncache_stride,maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \
240     in(separate_buffers,aend,nlocal,molecular,ntypes,mbinx,mbiny,special_bound)\
241     in(mbinz,xperiodic,yperiodic,zperiodic,xprd_half,yprd_half,zprd_half) \
242     in(stencilxyz:length(3*nstencil)) \
243     out(overflow:length(5) alloc_if(0) free_if(0)) \
244     out(timer_compute:length(1) alloc_if(0) free_if(0)) \
245     signal(tag)
246   #endif
247   {
248     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
249     *timer_compute = MIC_Wtime();
250     #endif
251 
252     #ifdef _LMP_INTEL_OFFLOAD
253     overflow[LMP_LOCAL_MIN] = 0;
254     overflow[LMP_LOCAL_MAX] = aend - 1;
255     overflow[LMP_GHOST_MIN] = e_nall;
256     overflow[LMP_GHOST_MAX] = -1;
257     #endif
258 
259     int nstencilp = 0;
260     int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL];
261     for (int k = 0; k < nstencil; k++) {
262       binstart[nstencilp] = stencil[k];
263       int end = stencil[k] + 1;
264       for (int kk = k + 1; kk < nstencil; kk++) {
265         if (stencil[kk-1]+1 == stencil[kk]) {
266           end++;
267           k++;
268         } else break;
269       }
270       binend[nstencilp] = end;
271       nstencilp++;
272     }
273 
274     const int mbinyx = mbiny * mbinx;
275 
276     #if defined(_OPENMP)
277     #pragma omp parallel
278     #endif
279     {
280       const int num = aend;
281       int tid, ifrom, ito;
282 
283       const double balance_factor = 2.0;
284       const double ibalance_factor = 1.0 / balance_factor;
285       const int gnum = num - nlocal;
286       const int wlocal = static_cast<int>(ceil(balance_factor * nlocal));
287       const int snum = wlocal + gnum;
288       IP_PRE_omp_range_id(ifrom, ito, tid, snum, nthreads);
289       if (ifrom < wlocal) ifrom = static_cast<int>(ibalance_factor * ifrom);
290       else ifrom -= wlocal - nlocal;
291       if (ito < wlocal) ito = static_cast<int>(ibalance_factor * ito);
292       else ito -= wlocal - nlocal;
293 
294       int e_ito = ito;
295       const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
296 
297       int pack_offset = maxnbors;
298       int ct = (ifrom + tid * 2) * maxnbors;
299       int * _noalias neighptr = intel_list + ct;
300       const int obound = pack_offset + maxnbors * 2;
301 
302       const int toffs = tid * ncache_stride;
303       flt_t * _noalias const tx = ncachex + toffs;
304       flt_t * _noalias const ty = ncachey + toffs;
305       flt_t * _noalias const tz = ncachez + toffs;
306       int * _noalias const tj = ncachej + toffs;
307       int * _noalias const tjtype = ncachejtype + toffs;
308       tagint * _noalias const ttag = ncachetag + toffs;
309 
310       // loop over all atoms in other bins in stencil, store every pair
311       int ncount, oldbin = -9999999;
312       for (int i = ifrom; i < ito; i++) {
313         const flt_t xtmp = x[i].x;
314         const flt_t ytmp = x[i].y;
315         const flt_t ztmp = x[i].z;
316         const int itype = x[i].w;
317         const tagint itag = tag[i];
318         const int ioffset = ntypes * itype;
319 
320         const int ibin = atombin[i];
321         if (ibin != oldbin) {
322           oldbin = ibin;
323           ncount = 0;
324           if (i < nlocal) {
325             for (int k = 0; k < nstencilp; k++) {
326               const int bstart = binhead[ibin + binstart[k]];
327               const int bend = binhead[ibin + binend[k]];
328               #if defined(LMP_SIMD_COMPILER)
329 #if defined(USE_OMP_SIMD)
330               #pragma omp simd
331 #else
332               #pragma simd
333 #endif
334               #endif
335               for (int jj = bstart; jj < bend; jj++)
336                 tj[ncount++] = binpacked[jj];
337             }
338           } else {
339             const int zbin = ibin / mbinyx;
340             const int zrem = ibin % mbinyx;
341             const int ybin = zrem / mbinx;
342             const int xbin = zrem % mbinx;
343             for (int k = 0; k < nstencil; k++) {
344               const int xbin2 = xbin + stencilxyz[3 * k + 0];
345               const int ybin2 = ybin + stencilxyz[3 * k + 1];
346               const int zbin2 = zbin + stencilxyz[3 * k + 2];
347               if (xbin2 < 0 || xbin2 >= mbinx ||
348                   ybin2 < 0 || ybin2 >= mbiny ||
349                   zbin2 < 0 || zbin2 >= mbinz) continue;
350 
351               const int bstart = binhead[ibin + stencil[k]];
352               const int bend = binhead[ibin + stencil[k] + 1];
353               #if defined(LMP_SIMD_COMPILER)
354 #if defined(USE_OMP_SIMD)
355               #pragma omp simd
356 #else
357               #pragma simd
358 #endif
359               #endif
360               for (int jj = bstart; jj < bend; jj++)
361                 tj[ncount++] = binpacked[jj];
362             }
363           } // if i < nlocal
364           #if defined(LMP_SIMD_COMPILER)
365 #if defined(USE_OMP_SIMD)
366           #pragma omp simd
367 #else
368           #pragma simd
369 #endif
370           #pragma vector aligned
371           #endif
372           for (int u = 0; u < ncount; u++) {
373             const int j = tj[u];
374             tx[u] = x[j].x;
375             ty[u] = x[j].y;
376             tz[u] = x[j].z;
377             tjtype[u] = x[j].w;
378             ttag[u] = tag[j];
379           }
380         } // if ibin != oldbin
381 
382         // ---------------------- Loop over other bins
383 
384         int n = maxnbors;
385         int n2 = n * 2;
386         int * _noalias neighptr2 = neighptr;
387         const flt_t * _noalias cutsq;
388         if (i < nlocal) cutsq = cutneighsq;
389         else cutsq = cutneighghostsq;
390 
391         const int icp = i;
392 
393         #if defined(LMP_SIMD_COMPILER)
394         #pragma vector aligned
395         #pragma ivdep
396         #endif
397         for (int u = 0; u < ncount; u++) {
398           int addme = 1;
399           int j = tj[u];
400 
401           if (i == j) addme = 0;
402 
403           // Cutoff Check
404           const flt_t delx = xtmp - tx[u];
405           const flt_t dely = ytmp - ty[u];
406           const flt_t delz = ztmp - tz[u];
407           const int jtype = tjtype[u];
408           const tagint jtag = ttag[u];
409           const flt_t rsq = delx * delx + dely * dely + delz * delz;
410           if (rsq > cutsq[ioffset + jtype]) addme = 0;
411 
412           if (need_ic && icp < nlocal) {
413             int no_special;
414             ominimum_image_check(no_special, delx, dely, delz);
415             if (no_special)
416               j = -j - 1;
417           }
418 
419           int flist = 0;
420           if (itag > jtag) {
421             if (((itag+jtag) & 1) == 0) flist = 1;
422           } else if (itag < jtag) {
423             if (((itag+jtag) & 1) == 1) flist = 1;
424           } else {
425             if (tz[u] < ztmp) flist = 1;
426             else if (tz[u] == ztmp && ty[u] < ytmp) flist = 1;
427             else if (tz[u] == ztmp && ty[u] == ytmp && tx[u] < xtmp)
428               flist = 1;
429           }
430           if (addme) {
431             if (flist)
432               neighptr2[n2++] = j;
433             else
434               neighptr[n++] = j;
435           }
436         } // for u
437 
438         if ((molecular != Atom::ATOMIC) && (i < nlocal)) {
439           int alln = n;
440           n = 0;
441           #if defined(LMP_SIMD_COMPILER)
442           #ifdef LMP_INTEL_NBOR_COMPAT
443           #pragma ivdep
444           #else
445 #if defined(USE_OMP_SIMD)
446           #pragma omp simd
447 #else
448           #pragma simd
449 #endif
450           #endif
451           #pragma vector aligned
452           #endif
453           for (int u = 0; u < alln; u++) {
454             int which;
455             int addme = 1;
456             int j = neighptr[u];
457             if (need_ic && j < 0) {
458               which = 0;
459               j = -j - 1;
460             } else
461               ofind_special(which, special, nspecial, i, tag[j]);
462             if (which) {
463               j = j ^ (which << SBBITS);
464               if (which < special_bound) addme = 0;
465             }
466             #ifdef LMP_INTEL_NBOR_COMPAT
467             if (addme) neighptr2[n++] = j;
468             #else
469             neighptr2[n++] = j;
470             #endif
471           }
472           alln = n2;
473           n2 = maxnbors * 2;
474           #if defined(LMP_SIMD_COMPILER)
475           #ifdef LMP_INTEL_NBOR_COMPAT
476           #pragma ivdep
477           #else
478 #if defined(USE_OMP_SIMD)
479           #pragma omp simd
480 #else
481           #pragma simd
482 #endif
483           #endif
484           #pragma vector aligned
485           #endif
486           for (int u = n2; u < alln; u++) {
487             int which;
488             int addme = 1;
489             int j = neighptr[u];
490             if (need_ic && j < 0) {
491               which = 0;
492               j = -j - 1;
493             } else
494               ofind_special(which, special, nspecial, i, tag[j]);
495             if (which) {
496               j = j ^ (which << SBBITS);
497               if (which < special_bound) addme = 0;
498             }
499             #ifdef LMP_INTEL_NBOR_COMPAT
500             if (addme) neighptr2[n2++] = j;
501             #else
502             neighptr2[n2++] = j;
503             #endif
504           }
505         }
506 
507         #ifndef _LMP_INTEL_OFFLOAD
508         if (exclude) {
509           int alln = n;
510           n = maxnbors;
511           #if defined(LMP_SIMD_COMPILER)
512           #pragma vector aligned
513           #pragma ivdep
514           #endif
515           for (int u = n; u < alln; u++) {
516             int addme = 1;
517             const int js = neighptr[u];
518             const int j = js & NEIGHMASK;
519             const int jtype = x[j].w;
520             if (exclusion(i,j,itype,jtype,mask,molecule)) addme = 0;
521             if (addme) neighptr2[n++] = js;
522           }
523           alln = n2;
524           n2 = maxnbors * 2;
525           #if defined(LMP_SIMD_COMPILER)
526           #pragma vector aligned
527           #pragma ivdep
528           #endif
529           for (int u = n2; u < alln; u++) {
530             int addme = 1;
531             const int js = neighptr[u];
532             const int j = js & NEIGHMASK;
533             const int jtype = x[j].w;
534             if (exclusion(i,j,itype,jtype,mask,molecule)) addme = 0;
535             if (addme) neighptr2[n2++] = js;
536           }
537         }
538         #endif
539 
540         int ns = n - maxnbors;
541         int alln = n;
542         atombin[i] = ns;
543         n = 0;
544         for (int u = maxnbors; u < alln; u++)
545           neighptr[n++] = neighptr2[u];
546         ns += n2 - maxnbors * 2;
547         for (int u = maxnbors * 2; u < n2; u++)
548           neighptr[n++] = neighptr2[u];
549         if (ns > maxnbors) *overflow = 1;
550 
551         ilist[i] = i;
552         firstneigh[i] = intel_list + ct;
553         numneigh[i] = ns;
554 
555         ct += ns;
556         IP_PRE_edge_align(ct, sizeof(int));
557         neighptr = intel_list + ct;
558         if (ct + obound > list_size) {
559           if (i < ito - 1) {
560             *overflow = 1;
561             ct = (ifrom + tid * 2) * maxnbors;
562           }
563         }
564       }
565 
566       if (*overflow == 1)
567         for (int i = ifrom; i < ito; i++)
568           numneigh[i] = 0;
569 
570       #ifdef _LMP_INTEL_OFFLOAD
571       if (separate_buffers) {
572         overflow[LMP_LOCAL_MIN] = 0;
573         overflow[LMP_LOCAL_MAX] = nlocal - 1;
574         overflow[LMP_GHOST_MIN] = nlocal;
575         overflow[LMP_GHOST_MAX] = e_nall - 1;
576       }
577       #endif
578     } // end omp
579     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
580     *timer_compute = MIC_Wtime() - *timer_compute;
581     #endif
582   } // end offload
583 
584   #ifdef _LMP_INTEL_OFFLOAD
585   if (offload) {
586     _fix->stop_watch(TIME_OFFLOAD_LATENCY);
587     _fix->start_watch(TIME_HOST_NEIGHBOR);
588     firstneigh[0] = intel_list;
589     for (int n = 0; n < aend; n++) {
590       ilist[n] = n;
591       numneigh[n] = 0;
592     }
593   } else {
594     if (separate_buffers) {
595       _fix->start_watch(TIME_PACK);
596       _fix->set_neighbor_host_sizes();
597       buffers->pack_sep_from_single(_fix->host_min_local(),
598                                     _fix->host_used_local(),
599                                     _fix->host_min_ghost(),
600                                     _fix->host_used_ghost());
601       _fix->stop_watch(TIME_PACK);
602     }
603   }
604   #endif
605 }
606