/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

#define SBBITS 30

__global__ void Binning_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, int bin_dim_z,
                               CUDA_FLOAT rez_bin_size_x, CUDA_FLOAT rez_bin_size_y, CUDA_FLOAT rez_bin_size_z)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  /*int* bin_count=(int*) _buffer;
  bin_count=bin_count+20;
  CUDA_FLOAT* binned_x=(CUDA_FLOAT*)(bin_count+bin_dim_x*bin_dim_y*bin_dim_z);*/
  CUDA_FLOAT* binned_x = (CUDA_FLOAT*) _buffer;
  binned_x = &binned_x[2];
  int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax];

  if(i < _nall) {
    // copy atom position from global device memory to local registers
    // in these 3 steps to get as much coalesced access as possible
    X_FLOAT* my_x = _x + i;
    CUDA_FLOAT x_i = *my_x;
    my_x += _nmax;
    CUDA_FLOAT y_i = *my_x;
    my_x += _nmax;
    CUDA_FLOAT z_i = *my_x;

    // calculate flat bin index
    int bx = __float2int_rd(rez_bin_size_x * (x_i - _sublo[0])) + 2;
    int by = __float2int_rd(rez_bin_size_y * (y_i - _sublo[1])) + 2;
    int bz = __float2int_rd(rez_bin_size_z * (z_i - _sublo[2])) + 2;

    // clamp the bin coordinates into [0, bin_dim - 1]
    bx -= bx * negativCUDA(1.0f * bx);
    bx -= (bx - bin_dim_x + 1) * negativCUDA(1.0f * bin_dim_x - 1.0f - 1.0f * bx);
    by -= by * negativCUDA(1.0f * by);
    by -= (by - bin_dim_y + 1) * negativCUDA(1.0f * bin_dim_y - 1.0f - 1.0f * by);
    bz -= bz * negativCUDA(1.0f * bz);
    bz -= (bz - bin_dim_z + 1) * negativCUDA(1.0f * bin_dim_z - 1.0f - 1.0f * bz);

    const unsigned j = bin_dim_z * (bin_dim_y * bx + by) + bz;

    // add new atom to bin, get bin-array position
    const unsigned k = atomicAdd(&bin_count[j], 1);

    if(k < bin_nmax) {
      binned_id[bin_nmax * j + k] = i;
      binned_x[3 * bin_nmax * j + k] = x_i;
      binned_x[3 * bin_nmax * j + k + bin_nmax] = y_i;
      binned_x[3 * bin_nmax * j + k + 2 * bin_nmax] = z_i;
    } else {
      // normally, this should not happen: count the overflow in _buffer[0]
      atomicAdd((int*) _buffer, 1);
      MYEMUDBG(printf("# CUDA: Binning_Kernel: WARNING: atom %i ignored, no place left in bin %u\n", i, j);)
    }
  }
}

__device__ inline int exclusion(int &i, int &j, int &itype, int &jtype)
{
  int m;

  if(_nex_type)
    if(_ex_type[itype * _cuda_ntypes + jtype]) return 1;

  if(_nex_group) {
    for(m = 0; m < _nex_group; m++) {
      if(_mask[i] & _ex1_bit[m] && _mask[j] & _ex2_bit[m]) return 1;

      if(_mask[i] & _ex2_bit[m] && _mask[j] & _ex1_bit[m]) return 1;
    }
  }

  if(_nex_mol) {
    if(_molecule[i] == _molecule[j])
      for(m = 0; m < _nex_mol; m++)
        if(_mask[i] & _ex_mol_bit[m] && _mask[j] & _ex_mol_bit[m]) return 1;
  }

  return 0;
}

extern __shared__ CUDA_FLOAT shared[];

__device__ inline int find_special(int3 &n, int* list, int &tag, int3 flag)
{
  int k = n.z;

  // find the position of tag in the special list (or n.z if it is absent) ...
  for(int l = 0; l < n.z; l++)
    k = ((list[l] == tag) ? l : k);

  // ... and map that position to the 1-2 / 1-3 / 1-4 flag by its range
  return k < n.x ? flag.x : (k < n.y ? flag.y : (k < n.z ? flag.z : 0));
}
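/* Illustrative host-side sketch (not part of the original file): how Binning_Kernel
   is expected to be sized and launched, reconstructed from the index and pointer
   arithmetic above. Variable names (threads_per_block, nall, bin_size_*, ...) are
   placeholders; the real launch code lives in the host-side neighbor class.

     int nbins = bin_dim_x * bin_dim_y * bin_dim_z;
     // _buffer layout assumed by all kernels in this file:
     //   [2 CUDA_FLOATs reserved; the first int doubles as the overflow flag]
     //   [binned_x: 3 * nbins * bin_nmax CUDA_FLOATs]
     //   [bin_count: nbins ints]
     dim3 threads(threads_per_block, 1, 1);
     dim3 grid((nall + threads_per_block - 1) / threads_per_block, 1, 1);  // one thread per atom
     // rez_bin_size_* are reciprocal bin sizes, i.e. 1.0 / bin_size_*
     Binning_Kernel<<<grid, threads>>>(binned_id, bin_nmax, bin_dim_x, bin_dim_y, bin_dim_z,
                                       1.0 / bin_size_x, 1.0 / bin_size_y, 1.0 / bin_size_z);
*/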
template <const unsigned int exclude>
__global__ void NeighborBuildFullBin_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y,
    CUDA_FLOAT globcutoff, int block_style, bool neighall)
{
  int natoms = neighall ? _nall : _nlocal;
  //const bool domol=false;
  int bin_dim_z = gridDim.y;
  CUDA_FLOAT* binned_x = (CUDA_FLOAT*) _buffer;
  binned_x = &binned_x[2];
  int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax];
  int bin = __mul24(gridDim.y, blockIdx.x) + blockIdx.y;
  int bin_x = blockIdx.x / bin_dim_y;
  int bin_y = blockIdx.x - bin_x * bin_dim_y;
  int bin_z = blockIdx.y;
  int bin_c = bin_count[bin];

  CUDA_FLOAT cut;

  if(globcutoff > 0)
    cut = globcutoff;

  int i = _nall;
  CUDA_FLOAT* my_x;
  CUDA_FLOAT x_i, y_i, z_i;

  for(int actOffset = 0; actOffset < bin_c; actOffset += blockDim.x) {
    int actIdx = threadIdx.x + actOffset;
    CUDA_FLOAT* other_x = shared;
    int* other_id = (int*) &other_x[3 * blockDim.x];

    if(actIdx < bin_c) {
      i = binned_id[__mul24(bin, bin_nmax) + actIdx];
      my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + actIdx;
      x_i = *my_x;
      my_x += bin_nmax;
      y_i = *my_x;
      my_x += bin_nmax;
      z_i = *my_x;
    } else
      i = 2 * _nall;

    __syncthreads();

    int jnum = 0;
    int itype;

    if(i < natoms) {
      jnum = 0;
      _ilist[i] = i;
      itype = _type[i];
    }

    //__syncthreads();

    // loop over the atoms of this thread's own bin in blockDim.x-sized shared-memory tiles
    for(int otherActOffset = 0; otherActOffset < bin_c; otherActOffset += blockDim.x) {
      int otherActIdx = threadIdx.x + otherActOffset;

      if(otherActIdx < bin_c) {
        if(otherActOffset == actOffset) {
          other_id[threadIdx.x] = i;
          other_x[threadIdx.x] = x_i;
          other_x[threadIdx.x + blockDim.x] = y_i;
          other_x[threadIdx.x + 2 * blockDim.x] = z_i;
        } else {
          other_id[threadIdx.x] = binned_id[__mul24(bin, bin_nmax) + otherActIdx];
          my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + otherActIdx;
          other_x[threadIdx.x] = *my_x;
          my_x += bin_nmax;
          other_x[threadIdx.x + blockDim.x] = *my_x;
          my_x += bin_nmax;
          other_x[threadIdx.x + __mul24(2, blockDim.x)] = *my_x;
        }
      }

      __syncthreads();

      // each thread starts at its own slot and wraps around, so the threads of a
      // warp touch distinct shared-memory entries in every iteration
      int kk = threadIdx.x;

      for(int k = 0; k < MIN(bin_c - otherActOffset, blockDim.x); ++k) {
        if(i < natoms) {
          kk++;
          kk = kk < MIN(bin_c - otherActOffset, blockDim.x) ? kk : 0;
          int j = other_id[kk];

          if(exclude && exclusion(i, j, itype, _type[j])) continue;

          if(globcutoff < 0) {
            int jtype = _type[j];
            cut = _cutneighsq[itype * _cuda_ntypes + jtype];
          }

          CUDA_FLOAT delx = x_i - other_x[kk];
          CUDA_FLOAT dely = y_i - other_x[kk + blockDim.x];
          CUDA_FLOAT delz = z_i - other_x[kk + 2 * blockDim.x];
          CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;

          if(rsq <= cut && i != j) {
            if(jnum < _maxneighbors) {
              if(block_style)
                _neighbors[i * _maxneighbors + jnum] = j;
              else
                _neighbors[i + jnum * natoms] = j;
            }

            ++jnum;
          }
        }
      }

      __syncthreads();
    }

    // loop over the 26 surrounding bins
    for(int obin_x = bin_x - 1; obin_x < bin_x + 2; obin_x++)
      for(int obin_y = bin_y - 1; obin_y < bin_y + 2; obin_y++)
        for(int obin_z = bin_z - 1; obin_z < bin_z + 2; obin_z++) {
          if(obin_x < 0 || obin_y < 0 || obin_z < 0) continue;

          if(obin_x >= bin_dim_x || obin_y >= bin_dim_y || obin_z >= bin_dim_z) continue;

          int other_bin = bin_dim_z * (bin_dim_y * obin_x + obin_y) + obin_z;

          if(other_bin == bin) continue;

          int obin_c = bin_count[other_bin];

          for(int otherActOffset = 0; otherActOffset < obin_c; otherActOffset += blockDim.x) {
            int otherActIdx = otherActOffset + threadIdx.x;

            if(threadIdx.x < MIN(blockDim.x, obin_c - otherActOffset)) {
              other_id[threadIdx.x] = binned_id[__mul24(other_bin, bin_nmax) + otherActIdx];
              my_x = binned_x + __mul24(__mul24(other_bin, 3), bin_nmax) + otherActIdx;
              other_x[threadIdx.x] = *my_x;
              my_x += bin_nmax;
              other_x[threadIdx.x + blockDim.x] = *my_x;
              my_x += bin_nmax;
              other_x[threadIdx.x + 2 * blockDim.x] = *my_x;
            }

            __syncthreads();

            for(int k = 0; k < MIN(blockDim.x, obin_c - otherActOffset); ++k) {
              if(i < natoms) {
                int j = other_id[k];

                if(exclude && exclusion(i, j, itype, _type[j])) continue;

                if(globcutoff < 0) {
                  int jtype = _type[j];
                  cut = _cutneighsq[itype * _cuda_ntypes + jtype];
                }

                CUDA_FLOAT delx = x_i - other_x[k];
                CUDA_FLOAT dely = y_i - other_x[k + blockDim.x];
                CUDA_FLOAT delz = z_i - other_x[k + 2 * blockDim.x];
                CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;

                if(rsq <= cut && i != j) {
                  if(jnum < _maxneighbors) {
                    if(block_style)
                      _neighbors[i * _maxneighbors + jnum] = j;
                    else
                      _neighbors[i + jnum * natoms] = j;
                  }

                  ++jnum;
                }
              }
            }

            __syncthreads();
          }
        }

    if(jnum > _maxneighbors) ((int*)_buffer)[0] = -jnum;  // report overflow to the host

    if(i < natoms) _numneigh[i] = jnum;
  }
}
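/* Illustrative launch sketch (assumption, not original code) for the kernel above,
   implied by its index math: one block per bin, with bin_dim_z carried in gridDim.y
   and a shared tile holding blockDim.x positions plus ids.

     dim3 grid(bin_dim_x * bin_dim_y, bin_dim_z, 1);
     dim3 threads(threads_per_block, 1, 1);
     size_t smem = threads.x * (3 * sizeof(CUDA_FLOAT) + sizeof(int));  // other_x + other_id
     // globcutoff > 0: one squared cutoff for all type pairs; globcutoff < 0: use _cutneighsq
     NeighborBuildFullBin_Kernel<0><<<grid, threads, smem>>>(binned_id, bin_nmax,
         bin_dim_x, bin_dim_y, global_cutoff_sq, block_style, false);
*/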
__global__ void FindSpecial(int block_style)
{
  int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  int which;
  int tag_mask = 0;
  int3 spec_flag;
  int3 mynspecial = {0, 0, 1};

  if(ii >= _nlocal) return;

  int special_id[CUDA_MAX_NSPECIAL];

  int i = _ilist[ii];

  if(i >= _nlocal) return;

  int jnum = _numneigh[i];

  // translate the 1-2 / 1-3 / 1-4 special_bonds settings into flags:
  // -1 = drop the neighbor, 0 = keep it as a plain neighbor, >0 = encode its class
  if(_special_flag[1] == 0) spec_flag.x = -1;
  else if(_special_flag[1] == 1) spec_flag.x = 0;
  else spec_flag.x = 1;

  if(_special_flag[2] == 0) spec_flag.y = -1;
  else if(_special_flag[2] == 1) spec_flag.y = 0;
  else spec_flag.y = 2;

  if(_special_flag[3] == 0) spec_flag.z = -1;
  else if(_special_flag[3] == 1) spec_flag.z = 0;
  else spec_flag.z = 3;

  mynspecial.x = _nspecial[i];
  mynspecial.y = _nspecial[i + _nmax];
  mynspecial.z = _nspecial[i + 2 * _nmax];

  if(i < _nlocal) {
    int* list = &_special[i];

    // tag_mask ORs together all special tags as a cheap pre-filter:
    // a neighbor tag can only be special if all of its bits are set in the mask
    for(int k = 0; k < mynspecial.z; k++) {
      special_id[k] = list[k * _nmax];
      tag_mask = tag_mask | special_id[k];
    }
  }

  for(int k = 0; k < MIN(jnum, _maxneighbors); k++) {
    int j;

    if(block_style)
      j = _neighbors[i * _maxneighbors + k];
    else
      j = _neighbors[i + k * _nlocal];

    int tag_j = _tag[j];
    which = 0;

    if((tag_mask & tag_j) == tag_j) {
      which = find_special(mynspecial, special_id, tag_j, spec_flag);

      if(which > 0) {
        // keep the neighbor, with its special class stored in the high bits
        if(block_style)
          _neighbors[i * _maxneighbors + k] = j ^ (which << SBBITS);
        else
          _neighbors[i + k * _nlocal] = j ^ (which << SBBITS);
      } else if(which < 0) {
        // excluded special neighbor: overwrite it with the last entry and shrink the list
        if(block_style)
          _neighbors[i * _maxneighbors + k] = _neighbors[i * _maxneighbors + jnum - 1];
        else
          _neighbors[i + k * _nlocal] = _neighbors[i + (jnum - 1) * _nlocal];

        jnum--;
        k--;
      }
    }
  }

  _numneigh[i] = jnum;
}
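/* Note on the encoding used by FindSpecial above: a special neighbor keeps its atom
   index in the low SBBITS bits and its class (1 = 1-2, 2 = 1-3, 3 = 1-4 partner) in
   the bits above. A pair kernel would decode it roughly as follows (sketch, matching
   the usual LAMMPS convention):

     int which = (j >> SBBITS) & 3;     // 0 for a plain neighbor
     j &= (1 << SBBITS) - 1;            // recover the atom index
     // 'which' then selects the special_lj / special_coul weighting factor
*/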
__global__ void NeighborBuildFullBin_OverlapComm_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y,
    CUDA_FLOAT globcutoff, int block_style)
{
  int bin_dim_z = gridDim.y;
  CUDA_FLOAT* binned_x = (CUDA_FLOAT*) _buffer;
  binned_x = &binned_x[2];
  int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax];
  int bin = __mul24(gridDim.y, blockIdx.x) + blockIdx.y;
  int bin_x = blockIdx.x / bin_dim_y;
  int bin_y = blockIdx.x - bin_x * bin_dim_y;
  int bin_z = blockIdx.y;
  int bin_c = bin_count[bin];

  CUDA_FLOAT cut;

  if(globcutoff > 0)
    cut = globcutoff;

  int i = _nall;
  CUDA_FLOAT* my_x;
  CUDA_FLOAT x_i, y_i, z_i;

  for(int actOffset = 0; actOffset < bin_c; actOffset += blockDim.x) {
    int actIdx = threadIdx.x + actOffset;
    CUDA_FLOAT* other_x = shared;
    int* other_id = (int*) &other_x[3 * blockDim.x];

    if(actIdx < bin_c) {
      i = binned_id[__mul24(bin, bin_nmax) + actIdx];
      my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + actIdx;
      x_i = *my_x;
      my_x += bin_nmax;
      y_i = *my_x;
      my_x += bin_nmax;
      z_i = *my_x;
    } else
      i = 2 * _nall;

    __syncthreads();

    int jnum = 0;
    int jnum_border = 0;
    int jnum_inner = 0;
    int i_border = -1;
    int itype;

    if(i < _nlocal) {
      jnum = 0;
      _ilist[i] = i;
      itype = _type[i];
    }

    __syncthreads();

    // loop over the atoms of this thread's own bin in blockDim.x-sized shared-memory tiles
    for(int otherActOffset = 0; otherActOffset < bin_c; otherActOffset += blockDim.x) {
      int otherActIdx = threadIdx.x + otherActOffset;

      if(otherActIdx < bin_c) {
        if(otherActOffset == actOffset) {
          other_id[threadIdx.x] = i;
          other_x[threadIdx.x] = x_i;
          other_x[threadIdx.x + blockDim.x] = y_i;
          other_x[threadIdx.x + 2 * blockDim.x] = z_i;
        } else {
          other_id[threadIdx.x] = binned_id[__mul24(bin, bin_nmax) + otherActIdx];
          my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + otherActIdx;
          other_x[threadIdx.x] = *my_x;
          my_x += bin_nmax;
          other_x[threadIdx.x + blockDim.x] = *my_x;
          my_x += bin_nmax;
          other_x[threadIdx.x + __mul24(2, blockDim.x)] = *my_x;
        }
      }

      __syncthreads();

      int kk = threadIdx.x;

      for(int k = 0; k < MIN(bin_c - otherActOffset, blockDim.x); ++k) {
        if(i < _nlocal) {
          kk++;
          kk = kk < MIN(bin_c - otherActOffset, blockDim.x) ? kk : 0;
          int j = other_id[kk];

          if(globcutoff < 0) {
            int jtype = _type[j];
            cut = _cutneighsq[itype * _cuda_ntypes + jtype];
          }

          CUDA_FLOAT delx = x_i - other_x[kk];
          CUDA_FLOAT dely = y_i - other_x[kk + blockDim.x];
          CUDA_FLOAT delz = z_i - other_x[kk + 2 * blockDim.x];
          CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;

          if(rsq <= cut && i != j) {
            // first ghost neighbor of atom i: reserve a slot in the border list
            if((j >= _nlocal) && (i_border < 0))
              i_border = atomicAdd(_inum_border, 1);

            if(jnum < _maxneighbors) {
              if(block_style) {
                _neighbors[i * _maxneighbors + jnum] = j;

                if(j >= _nlocal) {
                  _neighbors_border[i_border * _maxneighbors + jnum_border] = j;
                } else {
                  _neighbors_inner[i * _maxneighbors + jnum_inner] = j;
                }
              } else {
                _neighbors[i + jnum * _nlocal] = j;

                if(j >= _nlocal) {
                  _neighbors_border[i_border + jnum_border * _nlocal] = j;
                } else {
                  _neighbors_inner[i + jnum_inner * _nlocal] = j;
                }
              }
            }

            ++jnum;

            if(j >= _nlocal) jnum_border++;
            else jnum_inner++;
          }
        }
      }

      __syncthreads();
    }

    // loop over the 26 surrounding bins
    for(int obin_x = bin_x - 1; obin_x < bin_x + 2; obin_x++)
      for(int obin_y = bin_y - 1; obin_y < bin_y + 2; obin_y++)
        for(int obin_z = bin_z - 1; obin_z < bin_z + 2; obin_z++) {
          if(obin_x < 0 || obin_y < 0 || obin_z < 0) continue;

          if(obin_x >= bin_dim_x || obin_y >= bin_dim_y || obin_z >= bin_dim_z) continue;

          int other_bin = bin_dim_z * (bin_dim_y * obin_x + obin_y) + obin_z;

          if(other_bin == bin) continue;

          int obin_c = bin_count[other_bin];

          for(int otherActOffset = 0; otherActOffset < obin_c; otherActOffset += blockDim.x) {
            int otherActIdx = otherActOffset + threadIdx.x;

            if(threadIdx.x < MIN(blockDim.x, obin_c - otherActOffset)) {
              other_id[threadIdx.x] = binned_id[__mul24(other_bin, bin_nmax) + otherActIdx];
              my_x = binned_x + __mul24(__mul24(other_bin, 3), bin_nmax) + otherActIdx;
              other_x[threadIdx.x] = *my_x;
              my_x += bin_nmax;
              other_x[threadIdx.x + blockDim.x] = *my_x;
              my_x += bin_nmax;
              other_x[threadIdx.x + 2 * blockDim.x] = *my_x;
            }

            __syncthreads();

            for(int k = 0; k < MIN(blockDim.x, obin_c - otherActOffset); ++k) {
              if(i < _nlocal) {
                int j = other_id[k];

                if(globcutoff < 0) {
                  int jtype = _type[j];
                  cut = _cutneighsq[itype * _cuda_ntypes + jtype];
                }

                CUDA_FLOAT delx = x_i - other_x[k];
                CUDA_FLOAT dely = y_i - other_x[k + blockDim.x];
                CUDA_FLOAT delz = z_i - other_x[k + 2 * blockDim.x];
                CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;

                if(rsq <= cut && i != j) {
                  if((j >= _nlocal) && (i_border < 0))
                    i_border = atomicAdd(_inum_border, 1);

                  if(jnum < _maxneighbors) {
                    if(block_style) {
                      _neighbors[i * _maxneighbors + jnum] = j;

                      if(j >= _nlocal) {
                        _neighbors_border[i_border * _maxneighbors + jnum_border] = j;
                      } else {
                        _neighbors_inner[i * _maxneighbors + jnum_inner] = j;
                      }
                    } else {
                      _neighbors[i + jnum * _nlocal] = j;

                      if(j >= _nlocal) {
                        _neighbors_border[i_border + jnum_border * _nlocal] = j;
                      } else {
                        _neighbors_inner[i + jnum_inner * _nlocal] = j;
                      }
                    }
                  }

                  ++jnum;

                  if(j >= _nlocal) jnum_border++;
                  else jnum_inner++;
                }
              }
            }

            __syncthreads();
          }
        }

    if(jnum > _maxneighbors) ((int*)_buffer)[0] = -jnum;  // report overflow to the host

    if(i < _nlocal) {
      _numneigh[i] = jnum;
      _numneigh_inner[i] = jnum_inner;

      if(i_border >= 0) _numneigh_border[i_border] = jnum_border;

      if(i_border >= 0) _ilist_border[i_border] = i;
    }
  }
}
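/* The _OverlapComm variant above builds three lists in one pass: the full list
   (_neighbors), a local-only list (_neighbors_inner, entries with j < _nlocal), and
   a compacted per-atom border list (_neighbors_border / _ilist_border, indexed by
   the slot reserved via atomicAdd on _inum_border). The intent is that forces on
   purely local pairs can be computed while ghost-atom communication is still in
   flight; only the border lists have to wait for it. */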
__global__ void NeighborBuildFullNsq_Kernel()
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  int* buffer = (int*) _buffer;

  if(i < _nlocal) {
    X_FLOAT* my_x = _x + i;
    CUDA_FLOAT x_i = *my_x;
    my_x += _nmax;
    CUDA_FLOAT y_i = *my_x;
    my_x += _nmax;
    CUDA_FLOAT z_i = *my_x;

    int jnum = 0;
    int* jlist = _firstneigh[i];
    _ilist[i] = i;

    int itype = _type[i];
    // note: the __syncthreads() calls below sit inside the i < _nlocal branch and
    // therefore assume that all threads of a block take it
    __syncthreads();

    // brute force: test atom i against all _nall atoms
    for(int j = 0; j < _nall; ++j) {
      my_x = _x + j;
      CUDA_FLOAT x_j = *my_x;
      my_x += _nmax;
      CUDA_FLOAT y_j = *my_x;
      my_x += _nmax;
      CUDA_FLOAT z_j = *my_x;
      CUDA_FLOAT delx = x_i - x_j;
      CUDA_FLOAT dely = y_i - y_j;
      CUDA_FLOAT delz = z_i - z_j;
      CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;
      int jtype = _type[j];

      if(rsq <= _cutneighsq[itype * _cuda_ntypes + jtype] && i != j) {
        if(jnum < _maxneighbors)
          jlist[jnum] = j;

        if(i == 151) ((int*)_buffer)[jnum + 2] = j;  // debug output for atom 151

        ++jnum;
      }

      __syncthreads();
    }

    if(jnum > _maxneighbors) buffer[0] = 0;  // report overflow to the host

    _numneigh[i] = jnum;

    if(i == 151) ((int*)_buffer)[1] = jnum;  // debug output for atom 151
  }
}
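/* Illustrative end-to-end sketch (assumption; the actual driver is the host-side
   neighbor class): a full list rebuild is expected to proceed roughly as

     cudaMemset(dev_buffer, 0, sizeof(int));              // clear the overflow flag
     Binning_Kernel<<<grid_atoms, threads>>>(...);        // scatter atoms into bins
     NeighborBuildFullBin_Kernel<0><<<grid_bins, threads, smem>>>(...);
     FindSpecial<<<grid_atoms, threads>>>(block_style);   // weight / prune bonded pairs

   after which the host checks the first int of _buffer: a negative value signals an
   overflowed neighbor count, so the lists can be reallocated with a larger
   _maxneighbors and the build repeated.
*/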