lib/cuda/neighbor_kernel.cu

/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

// Bit offset used by FindSpecial below: the special-bond class `which` is
// shifted left by SBBITS and XOR-ed into the stored neighbor index.
#define SBBITS 30

/* Sorts every atom (index < _nall) into a spatial bin.
 *
 * Thread layout: one atom per thread, flat index
 *   (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x.
 *
 * _buffer layout as used here (in CUDA_FLOAT units unless noted):
 *   [0..1]   header; the first int is used as an overflow counter
 *   [2.. ]   binned coordinates, 3 * bin_nmax values per bin (x, y, z planes)
 *   after    one int counter per bin
 *
 * binned_id        receives the atom index for each occupied bin slot
 * bin_nmax         number of slots per bin
 * bin_dim_{x,y,z}  number of bins per direction
 * rez_bin_size_*   factor applied to (coordinate - _sublo[d]) to get a bin index
 */
__global__ void Binning_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, int bin_dim_z,
                               CUDA_FLOAT rez_bin_size_x, CUDA_FLOAT rez_bin_size_y, CUDA_FLOAT rez_bin_size_z)
{
  const int atom = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  // carve the coordinate area and the per-bin counters out of _buffer
  CUDA_FLOAT* binned_x = ((CUDA_FLOAT*) _buffer) + 2;
  int* bin_count = (int*)(binned_x + 3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax);

  // surplus threads have nothing to do (no barriers are used in this kernel)
  if(atom >= _nall) return;

  // positions are stored as three consecutive planes of _nmax entries each
  const CUDA_FLOAT x_i = _x[atom];
  const CUDA_FLOAT y_i = _x[atom + _nmax];
  const CUDA_FLOAT z_i = _x[atom + 2 * _nmax];

  // per-direction bin index, shifted by two bins
  int bx = __float2int_rd(rez_bin_size_x * (x_i - _sublo[0])) + 2;
  int by = __float2int_rd(rez_bin_size_y * (y_i - _sublo[1])) + 2;
  int bz = __float2int_rd(rez_bin_size_z * (z_i - _sublo[2])) + 2;

  // branch-free clamping to [0, bin_dim - 1] with the negativCUDA helper
  bx -= bx * negativCUDA(1.0f * bx);
  bx -= (bx - bin_dim_x + 1) * negativCUDA(1.0f * bin_dim_x - 1.0f - 1.0f * bx);
  by -= by * negativCUDA(1.0f * by);
  by -= (by - bin_dim_y + 1) * negativCUDA(1.0f * bin_dim_y - 1.0f - 1.0f * by);
  bz -= bz * negativCUDA(1.0f * bz);
  bz -= (bz - bin_dim_z + 1) * negativCUDA(1.0f * bin_dim_z - 1.0f - 1.0f * bz);

  // flat bin index, z running fastest
  const unsigned bin = bin_dim_z * (bin_dim_y * bx + by) + bz;

  // reserve a slot in the bin; the counter is incremented even when the bin is full
  const unsigned slot = atomicAdd(&bin_count[bin], 1);

  if(slot >= bin_nmax) {
    // normally, this should not happen: count the dropped atom in the first int of _buffer
    atomicAdd((int*) _buffer, 1);
    MYEMUDBG(printf("# CUDA: Binning_Kernel: WARNING: atom %i ignored, no place left in bin %u\n", atom, bin);)
    return;
  }

  binned_id[bin_nmax * bin + slot] = atom;

  const unsigned base = 3 * bin_nmax * bin + slot;
  binned_x[base] = x_i;
  binned_x[base + bin_nmax] = y_i;
  binned_x[base + 2 * bin_nmax] = z_i;
}


/* Returns 1 if the pair (i, j) must be excluded from the neighbor list, else 0.
 *
 * Three independent criteria are checked in order:
 *   - type pair:  _ex_type[itype * _cuda_ntypes + jtype] is non-zero
 *   - group pair: the masks of i and j match (_ex1_bit, _ex2_bit) in either order
 *   - molecule:   i and j carry the same _molecule value and both masks
 *                 match the same _ex_mol_bit entry
 */
__device__ inline int exclusion(int &i, int &j, int &itype, int &jtype)
{
  // exclusion by atom type pair
  if(_nex_type && _ex_type[itype * _cuda_ntypes + jtype]) return 1;

  // exclusion by group pair, tested in both orientations
  for(int g = 0; g < _nex_group; g++) {
    if((_mask[i] & _ex1_bit[g]) && (_mask[j] & _ex2_bit[g])) return 1;

    if((_mask[i] & _ex2_bit[g]) && (_mask[j] & _ex1_bit[g])) return 1;
  }

  // exclusion within the same molecule
  if(_nex_mol && _molecule[i] == _molecule[j]) {
    for(int g = 0; g < _nex_mol; g++) {
      if((_mask[i] & _ex_mol_bit[g]) && (_mask[j] & _ex_mol_bit[g])) return 1;
    }
  }

  return 0;
}

// Unsized (dynamically allocated) shared-memory array. The bin-based neighbor
// kernels in this file use it as 3 * blockDim.x CUDA_FLOAT coordinates (x, y, z
// planes) followed by blockDim.x int atom indices.
extern __shared__ CUDA_FLOAT shared[];

/* Looks up `tag` in the first n.z entries of `list` and maps its position to a flag.
 *
 * The position of the last matching entry is compared against the three
 * boundaries n.x, n.y, n.z:
 *   position < n.x  -> flag.x
 *   position < n.y  -> flag.y
 *   position < n.z  -> flag.z
 *   no match        -> 0
 */
__device__ inline int find_special(int3 &n, int* list, int &tag, int3 flag)
{
  // n.z doubles as the "not found" marker
  int hit = n.z;

  // scan the whole list; a later match overrides an earlier one
  for(int l = 0; l < n.z; l++) {
    if(list[l] == tag) hit = l;
  }

  if(hit < n.x) return flag.x;

  if(hit < n.y) return flag.y;

  if(hit < n.z) return flag.z;

  return 0;
}

/* Builds a full neighbor list from the binned atom data stored in _buffer.
 *
 * Launch layout as read by the code: one thread block per bin.
 *   blockIdx.y is the z bin index and gridDim.y is taken as bin_dim_z;
 *   blockIdx.x encodes (bin_x, bin_y) as bin_x * bin_dim_y + bin_y.
 * The threads of a block walk through the atoms of their bin in chunks of
 * blockDim.x atoms; every thread owns at most one atom i per chunk.
 *
 * Dynamic shared memory (`shared`): 3 * blockDim.x CUDA_FLOAT coordinates
 * followed by blockDim.x int atom indices, used to stage candidate atoms.
 *
 * Template parameter
 *   exclude      when non-zero, exclusion() is consulted for every candidate pair
 * Parameters
 *   binned_id    atom indices per bin, bin_nmax slots per bin
 *   bin_nmax     number of slots per bin
 *   bin_dim_x/y  number of bins in x and y (z comes from gridDim.y)
 *   globcutoff   > 0: used for every pair; < 0: the per-type-pair value from
 *                _cutneighsq is used instead. The value is compared directly
 *                against the squared distance.
 *   block_style  non-zero: neighbor k of atom i is stored at i * _maxneighbors + k;
 *                zero:     it is stored at i + k * natoms
 *   neighall     true: atoms with index < _nall get a list; false: index < _nlocal
 *
 * Outputs: _ilist, _neighbors, _numneigh. If a thread finds more than
 * _maxneighbors neighbors, -jnum is written to the first int of _buffer.
 *
 * NOTE(review): Binning_Kernel increments a bin counter even when the bin is
 * full, so bin_count[] may exceed bin_nmax; this kernel does not clamp it.
 * Presumably the host re-bins before calling this kernel in that case - confirm.
 */
template <const unsigned int exclude>
__global__ void NeighborBuildFullBin_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, CUDA_FLOAT globcutoff, int block_style, bool neighall)
{
  // number of atoms that receive a neighbor list
  int natoms = neighall ? _nall : _nlocal;
  //const bool domol=false;
  int bin_dim_z = gridDim.y;
  // same _buffer layout as in Binning_Kernel: 2 header slots, binned coordinates, bin counters
  CUDA_FLOAT* binned_x = (CUDA_FLOAT*) _buffer;
  binned_x = &binned_x[2];
  int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax];
  // flat index and 3d coordinates of the bin handled by this block
  int bin = __mul24(gridDim.y, blockIdx.x) + blockIdx.y;
  int bin_x = blockIdx.x / bin_dim_y;
  int bin_y = blockIdx.x - bin_x * bin_dim_y;
  int bin_z = blockIdx.y;
  // counter value of this bin (identical for all threads of the block)
  int bin_c = bin_count[bin];


  CUDA_FLOAT cut;

  // a positive global cutoff is used for all pairs; otherwise cut is set per pair below
  if(globcutoff > 0)
    cut = globcutoff;

  int i = _nall;
  CUDA_FLOAT* my_x;
  CUDA_FLOAT x_i, y_i, z_i;

  // outer loop: chunks of blockDim.x atoms of this bin; the trip count is uniform
  // across the block, so the barriers inside are reached by all threads
  for(int actOffset = 0; actOffset < bin_c; actOffset += blockDim.x) {

    int actIdx = threadIdx.x + actOffset;
    // partition the dynamic shared memory: coordinates first, then atom indices
    CUDA_FLOAT* other_x = shared;
    int* other_id = (int*) &other_x[3 * blockDim.x];

    if(actIdx < bin_c) {
      // load this thread's atom index and position from the binned arrays
      i = binned_id[__mul24(bin, bin_nmax) + actIdx];
      my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + actIdx;
      x_i = *my_x;
      my_x += bin_nmax;
      y_i = *my_x;
      my_x += bin_nmax;
      z_i = *my_x;
    } else
      // no atom for this thread in this chunk: sentinel intended to fail the `i < natoms` tests
      i = 2 * _nall;

    __syncthreads();

    // jnum counts every neighbor found, including those that no longer fit into the list
    int jnum = 0;
    int itype;

    if(i < natoms) {
      jnum = 0;
      _ilist[i] = i;
      itype = _type[i];
    }

    //__syncthreads();


    // part 1: pairs within the own bin, staged chunk by chunk through shared memory
    for(int otherActOffset = 0; otherActOffset < bin_c; otherActOffset += blockDim.x) {
      int otherActIdx = threadIdx.x + otherActOffset;

      if(otherActIdx < bin_c) {
        if(otherActOffset == actOffset) {
          // the staged chunk is the chunk currently owned by the threads: reuse the registers
          other_id[threadIdx.x] = i;
          other_x[threadIdx.x] = x_i;
          other_x[threadIdx.x + blockDim.x] = y_i;
          other_x[threadIdx.x + 2 * blockDim.x] = z_i;
        } else {
          other_id[threadIdx.x] = binned_id[__mul24(bin, bin_nmax) + otherActIdx];
          my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + otherActIdx;
          other_x[threadIdx.x] = *my_x;
          my_x += bin_nmax;
          other_x[threadIdx.x + blockDim.x] = *my_x;
          my_x += bin_nmax;
          other_x[threadIdx.x + __mul24(2, blockDim.x)] = *my_x;

        }
      }

      // staged data must be complete before any thread reads it
      __syncthreads();
      int kk = threadIdx.x;

      for(int k = 0; k < MIN(bin_c - otherActOffset, blockDim.x); ++k) {
        if(i < natoms) {
          // each thread starts one slot after its own and wraps around to 0, so all
          // staged slots are visited once, each thread in a different rotation
          kk++;
          kk = kk < MIN(bin_c - otherActOffset, blockDim.x) ? kk : 0;
          int j = other_id[kk];

          if(exclude && exclusion(i, j, itype, _type[j])) continue;

          if(globcutoff < 0) {
            int jtype = _type[j];
            cut = _cutneighsq[itype * _cuda_ntypes + jtype];
          }

          CUDA_FLOAT delx = x_i - other_x[kk];
          CUDA_FLOAT dely = y_i - other_x[kk + blockDim.x];
          CUDA_FLOAT delz = z_i - other_x[kk + 2 * blockDim.x];
          CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;


          if(rsq <= cut && i != j) {
            // store only while there is room; keep counting so the overflow can be reported
            if(jnum < _maxneighbors) {
              if(block_style)
                _neighbors[i * _maxneighbors + jnum] = j;
              else
                _neighbors[i + jnum * natoms] = j;
            }

            ++jnum;
          }
        }
      }

      // all threads must be done with the staged chunk before it is overwritten
      __syncthreads();

    }

    // part 2: pairs with atoms of the (up to 26) surrounding bins
    for(int obin_x = bin_x - 1; obin_x < bin_x + 2; obin_x++)
      for(int obin_y = bin_y - 1; obin_y < bin_y + 2; obin_y++)
        for(int obin_z = bin_z - 1; obin_z < bin_z + 2; obin_z++) {
          // skip bins outside the bin grid
          if(obin_x < 0 || obin_y < 0 || obin_z < 0) continue;

          if(obin_x >= bin_dim_x || obin_y >= bin_dim_y || obin_z >= bin_dim_z) continue;

          int other_bin = bin_dim_z * (bin_dim_y * obin_x + obin_y) + obin_z;

          // the own bin was already handled in part 1
          if(other_bin == bin) continue;

          int obin_c = bin_count[other_bin];

          for(int otherActOffset = 0; otherActOffset < obin_c; otherActOffset += blockDim.x) {
            int otherActIdx = otherActOffset + threadIdx.x;

            // stage the next chunk of the other bin in shared memory
            if(threadIdx.x < MIN(blockDim.x, obin_c - otherActOffset)) {
              other_id[threadIdx.x] = binned_id[__mul24(other_bin, bin_nmax) + otherActIdx];
              my_x = binned_x + __mul24(__mul24(other_bin, 3), bin_nmax) + otherActIdx;
              other_x[threadIdx.x] = *my_x;
              my_x += bin_nmax;
              other_x[threadIdx.x + blockDim.x] = *my_x;
              my_x += bin_nmax;
              other_x[threadIdx.x + 2 * blockDim.x] = *my_x;
            }

            __syncthreads();

            // here every thread walks the staged slots in the same order 0, 1, 2, ...
            for(int k = 0; k < MIN(blockDim.x, obin_c - otherActOffset); ++k) {
              if(i < natoms) {
                int j = other_id[k];

                if(exclude && exclusion(i, j, itype, _type[j])) continue;

                if(globcutoff < 0) {
                  int jtype = _type[j];
                  cut = _cutneighsq[itype * _cuda_ntypes + jtype];
                }

                CUDA_FLOAT delx = x_i - other_x[k];
                CUDA_FLOAT dely = y_i - other_x[k + blockDim.x];
                CUDA_FLOAT delz = z_i - other_x[k + 2 * blockDim.x];
                CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;

                if(rsq <= cut && i != j) {
                  if(jnum < _maxneighbors) {
                    if(block_style)
                      _neighbors[i * _maxneighbors + jnum] = j;
                    else
                      _neighbors[i + jnum * natoms] = j;
                  }

                  ++jnum;
                }
              }
            }

            __syncthreads();

          }
        }

    // report a neighbor list overflow through the first int of _buffer
    if(jnum > _maxneighbors)((int*)_buffer)[0] = -jnum;

    if(i < natoms)
      _numneigh[i] = jnum;
  }
}


/* Post-processes the neighbor list of every local atom with respect to its
 * special-neighbor list.
 *
 * Thread layout: one entry of _ilist per thread, flat index
 *   (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x.
 *
 * For each stored neighbor j of atom i whose tag is found in the special list
 * of i, the action depends on _special_flag[n] of the matching class n = 1..3:
 *   _special_flag[n] == 0  -> the neighbor is removed (replaced by the last one)
 *   _special_flag[n] == 1  -> the neighbor is left unchanged
 *   otherwise              -> n << SBBITS is XOR-ed into the stored index
 *
 * block_style selects the neighbor array layout:
 *   non-zero: neighbor k of atom i is at i * _maxneighbors + k
 *   zero:     neighbor k of atom i is at i + k * _nlocal
 */
__global__ void FindSpecial(int block_style)
{
  const int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(ii >= _nlocal) return;

  const int i = _ilist[ii];

  if(i >= _nlocal) return;

  int nneigh = _numneigh[i];

  // translate the three special flags into the action codes returned by find_special
  int3 spec_flag;
  spec_flag.x = (_special_flag[1] == 0) ? -1 : ((_special_flag[1] == 1) ? 0 : 1);
  spec_flag.y = (_special_flag[2] == 0) ? -1 : ((_special_flag[2] == 1) ? 0 : 2);
  spec_flag.z = (_special_flag[3] == 0) ? -1 : ((_special_flag[3] == 1) ? 0 : 3);

  // boundaries of the three classes inside the special list of atom i
  int3 nspec;
  nspec.x = _nspecial[i];
  nspec.y = _nspecial[i + _nmax];
  nspec.z = _nspecial[i + 2 * _nmax];

  // private copy of the special tags plus the OR of all of them as a cheap pre-filter
  int special_id[CUDA_MAX_NSPECIAL];
  int tag_mask = 0;
  int* column = &_special[i];

  for(int s = 0; s < nspec.z; s++) {
    special_id[s] = column[s * _nmax];
    tag_mask = tag_mask | special_id[s];
  }

  int k = 0;

  while(k < MIN(nneigh, _maxneighbors)) {
    const int slot = block_style ? (i * _maxneighbors + k) : (i + k * _nlocal);
    const int j = _neighbors[slot];
    int tag_j = _tag[j];

    // only tags whose bits are all contained in tag_mask can be in the special list
    if((tag_mask & tag_j) == tag_j) {
      const int which = find_special(nspec, special_id, tag_j, spec_flag);

      if(which > 0) {
        _neighbors[slot] = j ^ (which << SBBITS);
      } else if(which < 0) {
        // drop the neighbor: pull the last one into this slot and examine the slot again
        const int last = block_style ? (i * _maxneighbors + nneigh - 1) : (i + (nneigh - 1) * _nlocal);
        _neighbors[slot] = _neighbors[last];
        nneigh--;
        continue;
      }
    }

    k++;
  }

  _numneigh[i] = nneigh;
}

/* Builds a full neighbor list for the local atoms (index < _nlocal) from the
 * binned atom data in _buffer and additionally splits it into an "inner" list
 * (neighbors j < _nlocal) and a "border" list (neighbors j >= _nlocal).
 *
 * Launch layout as read by the code: one thread block per bin.
 *   blockIdx.y is the z bin index and gridDim.y is taken as bin_dim_z;
 *   blockIdx.x encodes (bin_x, bin_y) as bin_x * bin_dim_y + bin_y.
 * Dynamic shared memory (`shared`): 3 * blockDim.x CUDA_FLOAT coordinates
 * followed by blockDim.x int atom indices.
 *
 * Parameters
 *   binned_id    atom indices per bin, bin_nmax slots per bin
 *   bin_nmax     number of slots per bin
 *   bin_dim_x/y  number of bins in x and y (z comes from gridDim.y)
 *   globcutoff   > 0: used for every pair; < 0: the per-type-pair value from
 *                _cutneighsq is used instead. The value is compared directly
 *                against the squared distance.
 *   block_style  non-zero: neighbor k of row r is stored at r * _maxneighbors + k;
 *                zero:     it is stored at r + k * _nlocal
 *
 * Outputs: _ilist, _neighbors, _numneigh, _neighbors_inner, _numneigh_inner,
 * _neighbors_border, _numneigh_border, _ilist_border, and the counter
 * *_inum_border. The inner list is indexed by the atom index i, the border list
 * by a row i_border reserved with atomicAdd on _inum_border. If a thread finds
 * more than _maxneighbors neighbors, -jnum is written to the first int of _buffer.
 *
 * NOTE(review): Binning_Kernel increments a bin counter even when the bin is
 * full, so bin_count[] may exceed bin_nmax; this kernel does not clamp it.
 * Presumably the host re-bins before calling this kernel in that case - confirm.
 */
__global__ void NeighborBuildFullBin_OverlapComm_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, CUDA_FLOAT globcutoff, int block_style)
{
  int bin_dim_z = gridDim.y;
  // same _buffer layout as in Binning_Kernel: 2 header slots, binned coordinates, bin counters
  CUDA_FLOAT* binned_x = (CUDA_FLOAT*) _buffer;
  binned_x = &binned_x[2];
  int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax];
  // flat index and 3d coordinates of the bin handled by this block
  int bin = __mul24(gridDim.y, blockIdx.x) + blockIdx.y;
  int bin_x = blockIdx.x / bin_dim_y;
  int bin_y = blockIdx.x - bin_x * bin_dim_y;
  int bin_z = blockIdx.y;
  // counter value of this bin (identical for all threads of the block)
  int bin_c = bin_count[bin];


  CUDA_FLOAT cut;

  // a positive global cutoff is used for all pairs; otherwise cut is set per pair below
  if(globcutoff > 0)
    cut = globcutoff;

  int i = _nall;
  CUDA_FLOAT* my_x;
  CUDA_FLOAT x_i, y_i, z_i;

  // outer loop: chunks of blockDim.x atoms of this bin; the trip count is uniform
  // across the block, so the barriers inside are reached by all threads
  for(int actOffset = 0; actOffset < bin_c; actOffset += blockDim.x) {

    int actIdx = threadIdx.x + actOffset;
    // partition the dynamic shared memory: coordinates first, then atom indices
    CUDA_FLOAT* other_x = shared;
    int* other_id = (int*) &other_x[3 * blockDim.x];

    if(actIdx < bin_c) {
      // load this thread's atom index and position from the binned arrays
      i = binned_id[__mul24(bin, bin_nmax) + actIdx];
      my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + actIdx;
      x_i = *my_x;
      my_x += bin_nmax;
      y_i = *my_x;
      my_x += bin_nmax;
      z_i = *my_x;
    } else
      // no atom for this thread in this chunk: sentinel intended to fail the `i < _nlocal` tests
      i = 2 * _nall;

    __syncthreads();

    // jnum counts all neighbors, jnum_border / jnum_inner count the two subsets;
    // i_border is the row of this atom in the border list (-1: none reserved yet)
    int jnum = 0;
    int jnum_border = 0;
    int jnum_inner = 0;
    int i_border = -1;
    int itype;

    if(i < _nlocal) {
      jnum = 0;
      _ilist[i] = i;
      itype = _type[i];
    }

    __syncthreads();


    // part 1: pairs within the own bin, staged chunk by chunk through shared memory
    for(int otherActOffset = 0; otherActOffset < bin_c; otherActOffset += blockDim.x) {
      int otherActIdx = threadIdx.x + otherActOffset;

      if(otherActIdx < bin_c) {
        if(otherActOffset == actOffset) {
          // the staged chunk is the chunk currently owned by the threads: reuse the registers
          other_id[threadIdx.x] = i;
          other_x[threadIdx.x] = x_i;
          other_x[threadIdx.x + blockDim.x] = y_i;
          other_x[threadIdx.x + 2 * blockDim.x] = z_i;
        } else {
          other_id[threadIdx.x] = binned_id[__mul24(bin, bin_nmax) + otherActIdx];
          my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + otherActIdx;
          other_x[threadIdx.x] = *my_x;
          my_x += bin_nmax;
          other_x[threadIdx.x + blockDim.x] = *my_x;
          my_x += bin_nmax;
          other_x[threadIdx.x + __mul24(2, blockDim.x)] = *my_x;

        }
      }

      // staged data must be complete before any thread reads it
      __syncthreads();
      int kk = threadIdx.x;

      for(int k = 0; k < MIN(bin_c - otherActOffset, blockDim.x); ++k) {
        if(i < _nlocal) {
          // each thread starts one slot after its own and wraps around to 0, so all
          // staged slots are visited once, each thread in a different rotation
          kk++;
          kk = kk < MIN(bin_c - otherActOffset, blockDim.x) ? kk : 0;
          int j = other_id[kk];

          if(globcutoff < 0) {
            int jtype = _type[j];
            cut = _cutneighsq[itype * _cuda_ntypes + jtype];
          }

          CUDA_FLOAT delx = x_i - other_x[kk];
          CUDA_FLOAT dely = y_i - other_x[kk + blockDim.x];
          CUDA_FLOAT delz = z_i - other_x[kk + 2 * blockDim.x];
          CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;


          if(rsq <= cut && i != j) {
            // first neighbor with j >= _nlocal: reserve a row in the border list
            if((j >= _nlocal) && (i_border < 0))
              i_border = atomicAdd(_inum_border, 1);

            // store only while the total count fits; keep counting so the overflow can be reported
            if(jnum < _maxneighbors) {
              if(block_style) {
                _neighbors[i * _maxneighbors + jnum] = j;

                if(j >= _nlocal) {
                  _neighbors_border[i_border * _maxneighbors + jnum_border] = j;
                } else {
                  _neighbors_inner[i * _maxneighbors + jnum_inner] = j;
                }
              } else {
                _neighbors[i + jnum * _nlocal] = j;

                if(j >= _nlocal) {
                  _neighbors_border[i_border + jnum_border * _nlocal] = j;
                } else {
                  _neighbors_inner[i + jnum_inner * _nlocal] = j;
                }
              }
            }

            ++jnum;

            if(j >= _nlocal)
              jnum_border++;
            else
              jnum_inner++;
          }
        }
      }

      // all threads must be done with the staged chunk before it is overwritten
      __syncthreads();
    }

    // part 2: pairs with atoms of the (up to 26) surrounding bins
    for(int obin_x = bin_x - 1; obin_x < bin_x + 2; obin_x++)
      for(int obin_y = bin_y - 1; obin_y < bin_y + 2; obin_y++)
        for(int obin_z = bin_z - 1; obin_z < bin_z + 2; obin_z++) {
          // skip bins outside the bin grid
          if(obin_x < 0 || obin_y < 0 || obin_z < 0) continue;

          if(obin_x >= bin_dim_x || obin_y >= bin_dim_y || obin_z >= bin_dim_z) continue;

          int other_bin = bin_dim_z * (bin_dim_y * obin_x + obin_y) + obin_z;

          // the own bin was already handled in part 1
          if(other_bin == bin) continue;

          int obin_c = bin_count[other_bin];

          for(int otherActOffset = 0; otherActOffset < obin_c; otherActOffset += blockDim.x) {
            int otherActIdx = otherActOffset + threadIdx.x;

            // stage the next chunk of the other bin in shared memory
            if(threadIdx.x < MIN(blockDim.x, obin_c - otherActOffset)) {
              other_id[threadIdx.x] = binned_id[__mul24(other_bin, bin_nmax) + otherActIdx];
              my_x = binned_x + __mul24(__mul24(other_bin, 3), bin_nmax) + otherActIdx;
              other_x[threadIdx.x] = *my_x;
              my_x += bin_nmax;
              other_x[threadIdx.x + blockDim.x] = *my_x;
              my_x += bin_nmax;
              other_x[threadIdx.x + 2 * blockDim.x] = *my_x;
            }

            __syncthreads();

            // here every thread walks the staged slots in the same order 0, 1, 2, ...
            for(int k = 0; k < MIN(blockDim.x, obin_c - otherActOffset); ++k) {
              if(i < _nlocal) {
                int j = other_id[k];

                if(globcutoff < 0) {
                  int jtype = _type[j];
                  cut = _cutneighsq[itype * _cuda_ntypes + jtype];
                }

                CUDA_FLOAT delx = x_i - other_x[k];
                CUDA_FLOAT dely = y_i - other_x[k + blockDim.x];
                CUDA_FLOAT delz = z_i - other_x[k + 2 * blockDim.x];
                CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;

                if(rsq <= cut && i != j) {
                  // first neighbor with j >= _nlocal: reserve a row in the border list
                  if((j >= _nlocal) && (i_border < 0))
                    i_border = atomicAdd(_inum_border, 1);

                  if(jnum < _maxneighbors) {
                    if(block_style) {
                      _neighbors[i * _maxneighbors + jnum] = j;

                      if(j >= _nlocal) {
                        _neighbors_border[i_border * _maxneighbors + jnum_border] = j;
                      } else {
                        _neighbors_inner[i * _maxneighbors + jnum_inner] = j;
                      }
                    } else {
                      _neighbors[i + jnum * _nlocal] = j;

                      if(j >= _nlocal) {
                        _neighbors_border[i_border + jnum_border * _nlocal] = j;
                      } else {
                        _neighbors_inner[i + jnum_inner * _nlocal] = j;
                      }
                    }
                  }

                  ++jnum;

                  if(j >= _nlocal)
                    jnum_border++;
                  else
                    jnum_inner++;
                }
              }
            }

            __syncthreads();
          }
        }

    // report a neighbor list overflow through the first int of _buffer
    if(jnum > _maxneighbors)((int*)_buffer)[0] = -jnum;

    if(i < _nlocal) {
      _numneigh[i] = jnum;
      _numneigh_inner[i] = jnum_inner;

      // border bookkeeping only exists for atoms that reserved a border row
      if(i_border >= 0) _numneigh_border[i_border] = jnum_border;

      if(i_border >= 0) _ilist_border[i_border] = i;

    }
  }
}

/* Brute-force full neighbor list build: every local atom is tested against
 * all atoms (index < _nall).
 *
 * Thread layout: one local atom per thread, flat index
 *   (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
 * threads with an index >= _nlocal exit immediately.
 * The kernel uses neither shared memory nor barriers. The original version
 * called __syncthreads() inside the `i < _nlocal` branch, which is undefined
 * behaviour (and may hang) as soon as a block contains threads with
 * i >= _nlocal; since no data is exchanged between threads the barriers have
 * been removed.
 *
 * Outputs
 *   _ilist[i]       = i
 *   _firstneigh[i]  the first _maxneighbors neighbors of atom i
 *   _numneigh[i]    number of neighbors found (may exceed _maxneighbors)
 *   _buffer         the first int is set to 0 if any atom found more than
 *                   _maxneighbors neighbors
 */
__global__ void NeighborBuildFullNsq_Kernel()
{
  const int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  int* buffer = (int*) _buffer;

  // surplus threads of the last block have no atom to work on
  if(i >= _nlocal) return;

  // positions are stored as three consecutive planes of _nmax entries each
  const CUDA_FLOAT x_i = _x[i];
  const CUDA_FLOAT y_i = _x[i + _nmax];
  const CUDA_FLOAT z_i = _x[i + 2 * _nmax];

  int jnum = 0;
  int* jlist = _firstneigh[i];
  _ilist[i] = i;

  const int itype = _type[i];

  for(int j = 0; j < _nall; ++j) {
    const CUDA_FLOAT delx = x_i - _x[j];
    const CUDA_FLOAT dely = y_i - _x[j + _nmax];
    const CUDA_FLOAT delz = z_i - _x[j + 2 * _nmax];
    const CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;
    const int jtype = _type[j];

    if(rsq <= _cutneighsq[itype * _cuda_ntypes + jtype] && i != j) {
      // store only while there is room; keep counting so the overflow can be reported
      if(jnum < _maxneighbors)
        jlist[jnum] = j;

      // NOTE(review): hard-coded debug dump of the neighbors of atom 151 into
      // _buffer, kept for compatibility. The index jnum + 2 is not bounded
      // here - confirm that _buffer is large enough or remove this.
      if(i == 151) buffer[jnum + 2] = j;

      ++jnum;
    }
  }

  // signal a neighbor list overflow through the first int of _buffer
  if(jnum > _maxneighbors) buffer[0] = 0;

  _numneigh[i] = jnum;

  // NOTE(review): debug output for atom 151, see above
  if(i == 151) buffer[1] = jnum;
}