1 /* ----------------------------------------------------------------------
2 LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
3
4 Original Version:
5 http://lammps.sandia.gov, Sandia National Laboratories
6 Steve Plimpton, sjplimp@sandia.gov
7
8 See the README file in the top-level LAMMPS directory.
9
10 -----------------------------------------------------------------------
11
12 USER-CUDA Package and associated modifications:
13 https://sourceforge.net/projects/lammpscuda/
14
15 Christian Trott, christian.trott@tu-ilmenau.de
16 Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
17 Theoretical Physics II, University of Technology Ilmenau, Germany
18
19 See the README file in the USER-CUDA directory.
20
21 This software is distributed under the GNU General Public License.
22 ------------------------------------------------------------------------- */
23
24 #include <stdio.h>
25 #define MY_PREFIX compute_temp_partial_cuda
26 #include "cuda_shared.h"
27 #include "cuda_common.h"
28
29 #include "crm_cuda_utils.cu"
30
31 #include "compute_temp_partial_cuda_cu.h"
32 #include "compute_temp_partial_cuda_kernel.cu"
33
/* Ensure the shared per-block reduction buffer is large enough for this
   compute and publish its address to the device-side `buffer` symbol.
   The buffer holds one partial sum per block for up to 6 temperature
   tensor components; getgrid() packs 64 threads per block, hence the
   ceiling division by 64. */
void Cuda_ComputeTempPartialCuda_UpdateBuffer(cuda_shared_data* sdata)
{
  // Integer ceil-div; the original float divide + truncating cast gave
  // identical results but risked precision trouble for huge nlocal.
  int size = ((sdata->atom.nlocal + 63) / 64) * 6 * sizeof(ENERGY_FLOAT);

  if(sdata->buffersize < size) {
    MYDBG(printf("Cuda_ComputeTempPartialCuda Resizing Buffer at %p with %i bytes to\n", sdata->buffer, sdata->buffersize);)
    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
    sdata->buffer = CudaWrapper_AllocCudaData(size);
    sdata->buffersize = size;
    sdata->buffer_new++;  // tell other modules the shared buffer moved
    MYDBG(printf("New buffer at %p with %i bytes\n", sdata->buffer, sdata->buffersize);)
  }

  // Publish the (possibly reallocated) buffer pointer to the device.
  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
}
49
/* Refresh this compute's device constant symbols with the current
   per-atom device pointers and counters.  Must be re-run whenever the
   atom arrays may have been reallocated (nmax changed), because the
   dev_data pointers can move. */
void Cuda_ComputeTempPartialCuda_UpdateNmax(cuda_shared_data* sdata)
{
  cudaMemcpyToSymbol(MY_AP(mask), &sdata->atom.mask.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(mass), &sdata->atom.mass.dev_data, sizeof(V_FLOAT*));

  // A per-atom mass array only exists when the rmass flag is set.
  if(sdata->atom.rmass_flag)
    cudaMemcpyToSymbol(MY_AP(rmass), &sdata->atom.rmass.dev_data, sizeof(V_FLOAT*));

  cudaMemcpyToSymbol(MY_AP(rmass_flag), &sdata->atom.rmass_flag, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax), &sdata->atom.nmax, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(v), &sdata->atom.v.dev_data, sizeof(V_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(type), &sdata->atom.type.dev_data, sizeof(int*));
}
64
/* One-time setup for this compute: currently just seeds the device
   constant symbols with the current atom pointers/sizes. */
void Cuda_ComputeTempPartialCuda_Init(cuda_shared_data* sdata)
{
  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
}
69
70
/* Compute the 6-component partial temperature tensor on the GPU.
   Pass 1 produces per-block partial sums in the shared buffer; pass 2
   reduces them into t (device pointer, 6 ENERGY_FLOATs).  xflag/yflag/
   zflag select which velocity components contribute. */
void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t, int xflag, int yflag, int zflag)
{
  // This compute is most likely not invoked every timestep, so refresh
  // the device constants unconditionally instead of trusting the
  // update_nmax / update_nlocal / buffer_new flags.
  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
  cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal, sizeof(int));
  Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);

  int3 cfg = getgrid(sdata->atom.nlocal);
  dim3 threads(cfg.z, 1, 1);
  dim3 grid(cfg.x, cfg.y, 1);

  if(sdata->atom.nlocal > 0) {
    // Pass 1: per-block partial sums of the 6 tensor components
    // (shared memory: 6 ENERGY_FLOATs per thread).
    Cuda_ComputeTempPartialCuda_Vector_Kernel <<< grid, threads, threads.x * 6 * sizeof(ENERGY_FLOAT)>>> (groupbit, xflag, yflag, zflag);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Vector: compute_vector Kernel execution failed");

    // Pass 2: one 512-thread block per tensor component reduces the
    // per-block partials from pass 1.
    int nblocks = grid.x * grid.y;
    grid.x = 6;
    threads.x = 512;
    Cuda_ComputeTempPartialCuda_Reduce_Kernel <<< grid, threads, threads.x * sizeof(ENERGY_FLOAT)>>> (nblocks, t);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Vector: reduce_vector Kernel execution failed");
  }
}
97
/* Compute the scalar partial temperature on the GPU.  Pass 1 produces
   per-block partial sums; pass 2 reduces them into t (device pointer,
   one ENERGY_FLOAT).  xflag/yflag/zflag select which velocity
   components contribute. */
void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t, int xflag, int yflag, int zflag)
{
  // Likely not invoked every timestep, so refresh the device constants
  // unconditionally rather than trusting the dirty flags.
  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
  cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal, sizeof(int));
  Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);
  MYDBG(printf("#CUDA ComputeTempPartialCuda_Scalar: %i\n", sdata->atom.nlocal);)
  int3 cfg = getgrid(sdata->atom.nlocal);
  dim3 threads(cfg.z, 1, 1);
  dim3 grid(cfg.x, cfg.y, 1);

  if(sdata->atom.nlocal > 0) {
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: pre compute_scalar Kernel");
    // Pass 1: per-block partial sums (one ENERGY_FLOAT of shared
    // memory per thread).
    Cuda_ComputeTempPartialCuda_Scalar_Kernel <<< grid, threads, threads.x * sizeof(ENERGY_FLOAT)>>> (groupbit, xflag, yflag, zflag);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: compute_scalar Kernel execution failed");

    // Pass 2: a single 512-thread block folds the partials into t.
    int nblocks = grid.x * grid.y;
    grid.x = 1;
    threads.x = 512;
    Cuda_ComputeTempPartialCuda_Reduce_Kernel <<< grid, threads, threads.x * sizeof(ENERGY_FLOAT)>>> (nblocks, t);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: reduce_scalar Kernel execution failed");
  }
}
125
/* Subtract the bias (the velocity components excluded by xflag/yflag/
   zflag) from all atoms in the group, saving the removed parts into
   vbiasall (device array) so they can be restored later. */
void Cuda_ComputeTempPartialCuda_RemoveBiasAll(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, void* vbiasall)
{
  // Likely not invoked every timestep, so refresh the device constants
  // unconditionally rather than trusting the dirty flags.
  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
  cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal, sizeof(int));
  Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);

  int3 cfg = getgrid(sdata->atom.nlocal);
  dim3 threads(cfg.z, 1, 1);
  dim3 grid(cfg.x, cfg.y, 1);

  if(sdata->atom.nlocal > 0) {
    Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel <<< grid, threads, 0>>> (groupbit, xflag, yflag, zflag, (V_FLOAT*) vbiasall);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_RemoveBiasAll: compute_vector Kernel execution failed");
  }
}
145
/* Add the previously removed bias velocities (stored in vbiasall by
   Cuda_ComputeTempPartialCuda_RemoveBiasAll) back onto all atoms in
   the group. */
void Cuda_ComputeTempPartialCuda_RestoreBiasAll(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, void* vbiasall)
{
  //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
  //if(sdata->atom.update_nlocal)
  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
  //if(sdata->buffer_new)
  Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);

  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel <<< grid, threads, 0>>> (groupbit, xflag, yflag, zflag, (V_FLOAT*) vbiasall);
    cudaThreadSynchronize();
    // Fixed copy-paste bug: this message previously blamed
    // "RemoveBiasAll", which would mislabel a real failure here.
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_RestoreBiasAll: restore_bias_all Kernel execution failed");
  }
}
165