1 /* ---------------------------------------------------------------------- 2 LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 3 4 Original Version: 5 http://lammps.sandia.gov, Sandia National Laboratories 6 Steve Plimpton, sjplimp@sandia.gov 7 8 See the README file in the top-level LAMMPS directory. 9 10 ----------------------------------------------------------------------- 11 12 USER-CUDA Package and associated modifications: 13 https://sourceforge.net/projects/lammpscuda/ 14 15 Christian Trott, christian.trott@tu-ilmenau.de 16 Lars Winterfeld, lars.winterfeld@tu-ilmenau.de 17 Theoretical Physics II, University of Technology Ilmenau, Germany 18 19 See the README file in the USER-CUDA directory. 20 21 This software is distributed under the GNU General Public License. 22 ------------------------------------------------------------------------- */ 23 24 extern __shared__ F_FLOAT sharedmem[]; 25 26 Cuda_FixFreezeCuda_PostForce_Kernel(int groupbit)27__global__ void Cuda_FixFreezeCuda_PostForce_Kernel(int groupbit) 28 { 29 int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; 30 sharedmem[threadIdx.x] = 0; 31 sharedmem[threadIdx.x + blockDim.x] = 0; 32 sharedmem[threadIdx.x + 2 * blockDim.x] = 0; 33 34 if(i < _nlocal) 35 if(_mask[i] & groupbit) { 36 sharedmem[threadIdx.x] = _f[i]; 37 sharedmem[threadIdx.x + blockDim.x] = _f[i + 1 * _nmax]; 38 sharedmem[threadIdx.x + 2 * blockDim.x] = _f[i + 2 * _nmax]; 39 40 _f[i] = F_F(0.0); 41 _f[i + 1 * _nmax] = F_F(0.0); 42 _f[i + 2 * _nmax] = F_F(0.0); 43 _torque[i] = F_F(0.0); 44 _torque[i + 1 * _nmax] = F_F(0.0); 45 _torque[i + 2 * _nmax] = F_F(0.0); 46 } 47 48 49 reduceBlock(sharedmem); 50 reduceBlock(&sharedmem[blockDim.x]); 51 reduceBlock(&sharedmem[2 * blockDim.x]); 52 F_FLOAT* buffer = (F_FLOAT*)_buffer; 53 54 if(threadIdx.x == 0) { 55 buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0]; 56 buffer[blockIdx.x * gridDim.y + blockIdx.y + gridDim.x * gridDim.y] = sharedmem[blockDim.x]; 57 buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = sharedmem[2 * blockDim.x]; 58 } 59 } 60 61 Cuda_FixFreezeCuda_Reduce_FOriginal(int n,F_FLOAT * foriginal)62__global__ void Cuda_FixFreezeCuda_Reduce_FOriginal(int n, F_FLOAT* foriginal) 63 { 64 int i = 0; 65 sharedmem[threadIdx.x] = 0; 66 F_FLOAT myforig = 0.0; 67 F_FLOAT* buf = (F_FLOAT*)_buffer; 68 buf = &buf[blockIdx.x * n]; 69 70 while(i < n) { 71 sharedmem[threadIdx.x] = 0; 72 73 if(i + threadIdx.x < n) 74 sharedmem[threadIdx.x] = buf[i + threadIdx.x]; 75 76 __syncthreads(); 77 reduceBlock(sharedmem); 78 i += blockDim.x; 79 80 if(threadIdx.x == 0) 81 myforig += sharedmem[0]; 82 } 83 84 if(threadIdx.x == 0) 85 foriginal[blockIdx.x] = myforig; 86 } 87 88