1 /* ----------------------------------------------------------------------
2    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
3 
4    Original Version:
5    http://lammps.sandia.gov, Sandia National Laboratories
6    Steve Plimpton, sjplimp@sandia.gov
7 
8    See the README file in the top-level LAMMPS directory.
9 
10    -----------------------------------------------------------------------
11 
12    USER-CUDA Package and associated modifications:
13    https://sourceforge.net/projects/lammpscuda/
14 
15    Christian Trott, christian.trott@tu-ilmenau.de
16    Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
17    Theoretical Physics II, University of Technology Ilmenau, Germany
18 
19    See the README file in the USER-CUDA directory.
20 
21    This software is distributed under the GNU General Public License.
22 ------------------------------------------------------------------------- */
23 
24 extern __shared__ F_FLOAT sharedmem[];
25 
26 
Cuda_FixFreezeCuda_PostForce_Kernel(int groupbit)27 __global__ void Cuda_FixFreezeCuda_PostForce_Kernel(int groupbit)
28 {
29   int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
30   sharedmem[threadIdx.x] = 0;
31   sharedmem[threadIdx.x + blockDim.x] = 0;
32   sharedmem[threadIdx.x + 2 * blockDim.x] = 0;
33 
34   if(i < _nlocal)
35     if(_mask[i] & groupbit) {
36       sharedmem[threadIdx.x] = _f[i];
37       sharedmem[threadIdx.x + blockDim.x] = _f[i + 1 * _nmax];
38       sharedmem[threadIdx.x + 2 * blockDim.x] = _f[i + 2 * _nmax];
39 
40       _f[i] = F_F(0.0);
41       _f[i + 1 * _nmax] = F_F(0.0);
42       _f[i + 2 * _nmax] = F_F(0.0);
43       _torque[i] = F_F(0.0);
44       _torque[i + 1 * _nmax] = F_F(0.0);
45       _torque[i + 2 * _nmax] = F_F(0.0);
46     }
47 
48 
49   reduceBlock(sharedmem);
50   reduceBlock(&sharedmem[blockDim.x]);
51   reduceBlock(&sharedmem[2 * blockDim.x]);
52   F_FLOAT* buffer = (F_FLOAT*)_buffer;
53 
54   if(threadIdx.x == 0) {
55     buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
56     buffer[blockIdx.x * gridDim.y + blockIdx.y + gridDim.x * gridDim.y] = sharedmem[blockDim.x];
57     buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = sharedmem[2 * blockDim.x];
58   }
59 }
60 
61 
Cuda_FixFreezeCuda_Reduce_FOriginal(int n,F_FLOAT * foriginal)62 __global__ void Cuda_FixFreezeCuda_Reduce_FOriginal(int n, F_FLOAT* foriginal)
63 {
64   int i = 0;
65   sharedmem[threadIdx.x] = 0;
66   F_FLOAT myforig = 0.0;
67   F_FLOAT* buf = (F_FLOAT*)_buffer;
68   buf = &buf[blockIdx.x * n];
69 
70   while(i < n) {
71     sharedmem[threadIdx.x] = 0;
72 
73     if(i + threadIdx.x < n)
74       sharedmem[threadIdx.x] = buf[i + threadIdx.x];
75 
76     __syncthreads();
77     reduceBlock(sharedmem);
78     i += blockDim.x;
79 
80     if(threadIdx.x == 0)
81       myforig += sharedmem[0];
82   }
83 
84   if(threadIdx.x == 0)
85     foriginal[blockIdx.x] = myforig;
86 }
87 
88