1 /* ----------------------------------------------------------------------
2    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
3 
4    Original Version:
5    http://lammps.sandia.gov, Sandia National Laboratories
6    Steve Plimpton, sjplimp@sandia.gov
7 
8    See the README file in the top-level LAMMPS directory.
9 
10    -----------------------------------------------------------------------
11 
12    USER-CUDA Package and associated modifications:
13    https://sourceforge.net/projects/lammpscuda/
14 
15    Christian Trott, christian.trott@tu-ilmenau.de
16    Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
17    Theoretical Physics II, University of Technology Ilmenau, Germany
18 
19    See the README file in the USER-CUDA directory.
20 
21    This software is distributed under the GNU General Public License.
22 ------------------------------------------------------------------------- */
23 
24 #include <stdio.h>
25 #define MY_PREFIX compute_temp_partial_cuda
26 #include "cuda_shared.h"
27 #include "cuda_common.h"
28 
29 #include "crm_cuda_utils.cu"
30 
31 #include "compute_temp_partial_cuda_cu.h"
32 #include "compute_temp_partial_cuda_kernel.cu"
33 
void Cuda_ComputeTempPartialCuda_UpdateBuffer(cuda_shared_data* sdata)
{
  // Required scratch size for the two-pass reduction: 6 ENERGY_FLOAT partial
  // sums per group of 64 atoms (matches the Vector/Scalar kernel launches).
  // Integer ceil-division replaces the old float round-trip
  // ((unsigned)((n + 63) / 64.0)), which did the same thing via double math.
  int size = ((sdata->atom.nlocal + 63) / 64) * 6 * sizeof(ENERGY_FLOAT);

  // Grow (never shrink) the package-wide shared device buffer on demand.
  if(sdata->buffersize < size) {
    MYDBG(printf("Cuda_ComputeTempPartialCuda Resizing Buffer at %p with %i bytes to\n", sdata->buffer, sdata->buffersize);)
    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
    sdata->buffer = CudaWrapper_AllocCudaData(size);
    sdata->buffersize = size;
    sdata->buffer_new++;  // flag so other modules sharing the buffer re-fetch it
    MYDBG(printf("New buffer at %p with %i bytes\n", sdata->buffer, sdata->buffersize);)
  }

  // Publish the (possibly reallocated) buffer pointer to this module's
  // device symbol; all pointer types have the same size here.
  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
}
49 
void Cuda_ComputeTempPartialCuda_UpdateNmax(cuda_shared_data* sdata)
{
  // Mirror the current atom counters and per-atom device array pointers into
  // this module's device symbols so the kernels see up-to-date addresses.
  cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax),   &sdata->atom.nmax,   sizeof(int));

  cudaMemcpyToSymbol(MY_AP(mask), &sdata->atom.mask.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(type), &sdata->atom.type.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(v),    &sdata->atom.v.dev_data,    sizeof(V_FLOAT*));

  // Per-type masses are always published; per-atom masses only when enabled.
  cudaMemcpyToSymbol(MY_AP(mass),       &sdata->atom.mass.dev_data, sizeof(V_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(rmass_flag), &sdata->atom.rmass_flag,    sizeof(int));

  if(sdata->atom.rmass_flag != 0)
    cudaMemcpyToSymbol(MY_AP(rmass), &sdata->atom.rmass.dev_data, sizeof(V_FLOAT*));
}
64 
void Cuda_ComputeTempPartialCuda_Init(cuda_shared_data* sdata)
{
  // One-time setup: push the current atom pointers/counters to the device.
  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
}
69 
70 
void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t, int xflag, int yflag, int zflag)
{
  // Two-pass reduction of 6 per-atom accumulators (presumably the temperature
  // tensor components — see the kernel file) into t, restricted to atoms in
  // groupbit; x/y/zflag select which velocity components contribute.
  // This compute is most likely not called every timestep, so the device
  // constants must be refreshed unconditionally.
  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
  Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);

  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    // Pass 1: per-block partial sums into the shared buffer
    // (6 ENERGY_FLOATs of dynamic shared memory per thread).
    Cuda_ComputeTempPartialCuda_Vector_Kernel <<< grid, threads, threads.x* 6* sizeof(ENERGY_FLOAT)>>> (groupbit, xflag, yflag, zflag);
    cudaThreadSynchronize();  // NOTE(review): deprecated alias of cudaDeviceSynchronize; kept for package consistency
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Vector: compute_vector Kernel execution failed");

    // Pass 2: one block per tensor component reduces the oldgrid partials.
    int oldgrid = grid.x * grid.y;
    grid.x = 6;
    grid.y = 1;  // fix: was left at layout.y, launching redundant duplicate rows of reduce blocks
    threads.x = 512;
    Cuda_ComputeTempPartialCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (oldgrid, t);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Vector: reduce_vector Kernel execution failed");
  }
}
97 
void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t, int xflag, int yflag, int zflag)
{
  // Two-pass reduction of a single per-atom accumulator (the scalar
  // temperature sum) into t, restricted to atoms in groupbit; x/y/zflag
  // select which velocity components contribute.
  // This compute is most likely not called every timestep, so the device
  // constants must be refreshed unconditionally.
  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
  Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);
  MYDBG(printf("#CUDA ComputeTempPartialCuda_Scalar: %i\n", sdata->atom.nlocal);)
  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: pre compute_scalar Kernel");
    // Pass 1: per-block partial sums into the shared buffer.
    Cuda_ComputeTempPartialCuda_Scalar_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (groupbit, xflag, yflag, zflag);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: compute_scalar Kernel execution failed");

    // Pass 2: a single block reduces the oldgrid partial sums into t.
    int oldgrid = grid.x * grid.y;
    grid.x = 1;
    grid.y = 1;  // fix: was left at layout.y, launching redundant duplicate reduce blocks
    threads.x = 512;
    Cuda_ComputeTempPartialCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (oldgrid, t);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: reduce_scalar Kernel execution failed");
  }
}
125 
void Cuda_ComputeTempPartialCuda_RemoveBiasAll(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, void* vbiasall)
{
  // Strip the bias component from the velocities of atoms in groupbit,
  // stashing the removed part in vbiasall so it can be restored later.
  // Not necessarily called every timestep, hence the unconditional refresh
  // of the device-side constants.
  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
  cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal, sizeof(int));
  Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);

  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 blockDims(layout.z, 1, 1);
  dim3 gridDims(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel <<< gridDims, blockDims, 0>>> (groupbit, xflag, yflag, zflag, (V_FLOAT*) vbiasall);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_RemoveBiasAll: compute_vector Kernel execution failed");
  }
}
145 
void Cuda_ComputeTempPartialCuda_RestoreBiasAll(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, void* vbiasall)
{
  // Add the previously removed bias (stored in vbiasall by RemoveBiasAll)
  // back onto the velocities of atoms in groupbit.
  // Not necessarily called every timestep, hence the unconditional refresh
  // of the device-side constants.
  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
  Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);

  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel <<< grid, threads, 0>>> (groupbit, xflag, yflag, zflag, (V_FLOAT*) vbiasall);
    cudaThreadSynchronize();
    // fix: error message previously reported "RemoveBiasAll" (copy-paste),
    // misattributing failures from this kernel during debugging.
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_RestoreBiasAll: compute_vector Kernel execution failed");
  }
}
165