1 /* ----------------------------------------------------------------------
2 LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
3
4 Original Version:
5 http://lammps.sandia.gov, Sandia National Laboratories
6 Steve Plimpton, sjplimp@sandia.gov
7
8 See the README file in the top-level LAMMPS directory.
9
10 -----------------------------------------------------------------------
11
12 USER-CUDA Package and associated modifications:
13 https://sourceforge.net/projects/lammpscuda/
14
15 Christian Trott, christian.trott@tu-ilmenau.de
16 Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
17 Theoretical Physics II, University of Technology Ilmenau, Germany
18
19 See the README file in the USER-CUDA directory.
20
21 This software is distributed under the GNU General Public License.
22 ------------------------------------------------------------------------- */
23
24 enum PAIR_FORCES {PAIR_NONE, PAIR_BORN, PAIR_BUCK, PAIR_CG_CMM, PAIR_LJ_CHARMM, PAIR_LJ_CLASS2, PAIR_LJ_CUT, PAIR_LJ_EXPAND, PAIR_LJ_GROMACS, PAIR_LJ_SMOOTH, PAIR_LJ96_CUT, PAIR_MORSE, PAIR_MORSE_R6};
25 enum COUL_FORCES {COUL_NONE, COUL_CHARMM, COUL_CHARMM_IMPLICIT, COUL_CUT, COUL_LONG, COUL_DEBYE, COUL_GROMACS, COUL_SPECIAL};
26 #define DATA_NONE 0
27 #define DATA_V 1
28 #define DATA_TAG 2
29 #define DATA_RMASS 4
30 #define DATA_MASS 8
31 #define DATA_TORQUE 16
32 #define DATA_OMEGA 32
33 #define DATA_RADIUS 64
34 #define DATA_DENSITY 128
35 #define DATA_MASK 256
36 #define DATA_V_RADIUS 512
37 #define DATA_OMEGA_RMASS 1024
38
39 #define NEIGHMASK 0x3FFFFFFF
40
41 #define MY_PREFIX cuda_pair
42 #define IncludeCommonNeigh
43 #include "cuda_shared.h"
44 #include "cuda_common.h"
45 #include "cuda_wrapper_cu.h"
46 #include "crm_cuda_utils.cu"
47
48 //constants used by multiple forces
49
50 //general
51 #define _cutsq MY_AP(cutsq)
52 #define _offset MY_AP(offset)
53 #define _special_lj MY_AP(special_lj)
54 #define _special_coul MY_AP(special_coul)
55 #define _cutsq_global MY_AP(cutsq_global)
56 #define _collect_forces_later MY_AP(collect_forces_later)
57
58 __device__ __constant__ X_FLOAT _cutsq[CUDA_MAX_TYPES2];
59 __device__ __constant__ ENERGY_FLOAT _offset[CUDA_MAX_TYPES2];
60 __device__ __constant__ F_FLOAT _special_lj[4];
61 __device__ __constant__ F_FLOAT _special_coul[4];
62 __device__ __constant__ X_FLOAT _cutsq_global;
63 __device__ __constant__ int _collect_forces_later;
64
65 __device__ __constant__ F_FLOAT MY_AP(coeff1)[CUDA_MAX_TYPES2]; //pair force coefficients in case ntypes < CUDA_MAX_TYPES (coeffs fit into constant space)
66 __device__ __constant__ F_FLOAT MY_AP(coeff2)[CUDA_MAX_TYPES2];
67 __device__ __constant__ F_FLOAT MY_AP(coeff3)[CUDA_MAX_TYPES2];
68 __device__ __constant__ F_FLOAT MY_AP(coeff4)[CUDA_MAX_TYPES2];
69 __device__ __constant__ F_FLOAT MY_AP(coeff5)[CUDA_MAX_TYPES2];
70
71
72 __device__ __constant__ F_FLOAT* MY_AP(coeff1_gm); //pair force coefficients in case ntypes > CUDA_MAX_TYPES (coeffs do not fit into constant space)
73 __device__ __constant__ F_FLOAT* MY_AP(coeff2_gm);
74 __device__ __constant__ F_FLOAT* MY_AP(coeff3_gm);
75 __device__ __constant__ F_FLOAT* MY_AP(coeff4_gm);
76 __device__ __constant__ F_FLOAT* MY_AP(coeff5_gm);
77 __device__ __constant__ F_FLOAT* MY_AP(coeff6_gm);
78 __device__ __constant__ F_FLOAT* MY_AP(coeff7_gm);
79 __device__ __constant__ F_FLOAT* MY_AP(coeff8_gm);
80 __device__ __constant__ F_FLOAT* MY_AP(coeff9_gm);
81 __device__ __constant__ F_FLOAT* MY_AP(coeff10_gm);
82
83 #define _coeff1_gm_tex MY_AP(coeff1_gm_tex)
84 #if F_PRECISION == 1
85 texture<float> _coeff1_gm_tex;
86 #else
87 texture<int2, 1> _coeff1_gm_tex;
88 #endif
89
90 #define _coeff2_gm_tex MY_AP(coeff2_gm_tex)
91 #if F_PRECISION == 1
92 texture<float> _coeff2_gm_tex;
93 #else
94 texture<int2, 1> _coeff2_gm_tex;
95 #endif
96
97 #define _coeff3_gm_tex MY_AP(coeff3_gm_tex)
98 #if F_PRECISION == 1
99 texture<float> _coeff3_gm_tex;
100 #else
101 texture<int2, 1> _coeff3_gm_tex;
102 #endif
103
104 #define _coeff4_gm_tex MY_AP(coeff4_gm_tex)
105 #if F_PRECISION == 1
106 texture<float> _coeff4_gm_tex;
107 #else
108 texture<int2, 1> _coeff4_gm_tex;
109 #endif
110
111 #define _coeff5_gm_tex MY_AP(coeff5_gm_tex)
112 #if F_PRECISION == 1
113 texture<float> _coeff5_gm_tex;
114 #else
115 texture<int2, 1> _coeff5_gm_tex;
116 #endif
117
118 #define _coeff6_gm_tex MY_AP(coeff6_gm_tex)
119 #if F_PRECISION == 1
120 texture<float> _coeff6_gm_tex;
121 #else
122 texture<int2, 1> _coeff6_gm_tex;
123 #endif
124
125 #define _coeff7_gm_tex MY_AP(coeff7_gm_tex)
126 #if F_PRECISION == 1
127 texture<float> _coeff7_gm_tex;
128 #else
129 texture<int2, 1> _coeff7_gm_tex;
130 #endif
131
132 #define _coeff8_gm_tex MY_AP(coeff8_gm_tex)
133 #if F_PRECISION == 1
134 texture<float> _coeff8_gm_tex;
135 #else
136 texture<int2, 1> _coeff8_gm_tex;
137 #endif
138
139 #define _coeff9_gm_tex MY_AP(coeff9_gm_tex)
140 #if F_PRECISION == 1
141 texture<float> _coeff9_gm_tex;
142 #else
143 texture<int2, 1> _coeff9_gm_tex;
144 #endif
145
146 #define _coeff10_gm_tex MY_AP(coeff10_gm_tex)
147 #if F_PRECISION == 1
148 texture<float> _coeff10_gm_tex;
149 #else
150 texture<int2, 1> _coeff10_gm_tex;
151 #endif
152
153 //if more than 5 coefficients are needed for a pair potential add them here
154
155
156 //coulomb
157 #define _cut_coulsq MY_AP(cut_coulsq)
158 #define _cut_coulsq_global MY_AP(cut_coulsq_global)
159 #define _g_ewald MY_AP(g_ewald)
160 #define _qqrd2e MY_AP(qqrd2e)
161 #define _kappa MY_AP(kappa)
162 __device__ __constant__ X_FLOAT _cut_coulsq[CUDA_MAX_TYPES2];
163 __device__ __constant__ X_FLOAT _cut_coulsq_global;
164 __device__ __constant__ F_FLOAT _g_ewald;
165 __device__ __constant__ F_FLOAT _qqrd2e;
166 __device__ __constant__ F_FLOAT _kappa;
167
168 //inner cutoff
169 #define _cut_innersq MY_AP(cut_innersq)
170 #define _cut_innersq_global MY_AP(cut_innersq_global)
171 __device__ __constant__ X_FLOAT _cut_innersq[CUDA_MAX_TYPES2];
172 __device__ __constant__ X_FLOAT _cut_innersq_global;
173
174
175 template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
176 __global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_atom);
177
178 template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
179 __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_atom);
180
181 template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
182 __global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vflag_atom, int comm_phase);
183
184 template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
185 __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vflag_atom, int comm_phase);
186
187 #include <stdio.h>
188 #include "cuda_pair_cu.h"
189 #include "cuda_pair_virial_kernel_nc.cu"
190
191 //Functions which are shared by pair styles
192
193 //Update Buffersize
//Grow (never shrink) the persistent GPU scratch buffer shared by all pair styles
//and publish its device address to the constant-memory symbol MY_AP(buffer).
//  sdata: shared host/device state owning the buffer
//  size:  required buffer size in bytes
void Cuda_UpdateBuffer(cuda_shared_data* sdata, int size)
{
  CUT_CHECK_ERROR("Cuda_Pair_UpdateBuffer_AllStyles: before updateBuffer failed");

  if(sdata->buffersize < size) {
    //grow-only policy avoids reallocation thrash when call sites request varying sizes
    //(debug messages previously said "kB" although buffersize is tracked in bytes)
    MYDBG(printf("Resizing Buffer at %p with %i bytes to\n", sdata->buffer, sdata->buffersize);)
    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
    sdata->buffer = CudaWrapper_AllocCudaData(size);
    sdata->buffersize = size;
    sdata->buffer_new++; //signals consumers that the buffer address changed
    MYDBG(printf("New buffer at %p with %i bytes\n", sdata->buffer, sdata->buffersize);)
  }

  //all pointers have the same size on the device, so sizeof(int*) matches the symbol
  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
  CUT_CHECK_ERROR("Cuda_Pair_UpdateBuffer_AllStyles failed");
}
210
//Push the current neighbor-list pointers and sizes into device constant memory.
//Called whenever the neighbor list was rebuilt (sdata->atom.update_neigh).
//The *_border / *_inner symbols are only valid when overlapping communication
//with computation (sdata->overlap_comm), so they are updated conditionally.
void Cuda_Pair_UpdateNeighbor_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
  //error checking added for consistency with Cuda_Pair_UpdateNmax_AllStyles
  CUT_CHECK_ERROR("Cuda_Pair_UpdateNeighbor_AllStyles: Begin");

  //Neighbor
  cudaMemcpyToSymbol(MY_AP(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0] , sizeof(unsigned));
  cudaMemcpyToSymbol(MY_AP(firstneigh) , & sneighlist->firstneigh.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(ilist) , & sneighlist->ilist .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(inum) , & sneighlist->inum , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(neighbors) , & sneighlist->neighbors .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(maxneighbors) , & sneighlist->maxneighbors , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(overlap_comm) , & sdata->overlap_comm, sizeof(int));

  if(sdata->overlap_comm) {
    cudaMemcpyToSymbol(MY_AP(numneigh_border) , & sneighlist->numneigh_border .dev_data, sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(numneigh_inner) , & sneighlist->numneigh_inner .dev_data, sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(neighbors_border) , & sneighlist->neighbors_border.dev_data, sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(neighbors_inner) , & sneighlist->neighbors_inner .dev_data, sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(ilist_border) , & sneighlist->ilist_border .dev_data, sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(inum_border) , & sneighlist->inum_border .dev_data, sizeof(int*));
  }

  CUT_CHECK_ERROR("Cuda_Pair_UpdateNeighbor_AllStyles: End");
}
233 //Update constants after nmax change which are generally needed by all pair styles
//Refresh device constant-memory copies of the per-atom array pointers and the
//system sizes after nmax changed (arrays may have been reallocated on the host
//side, so every dev_data pointer must be re-published).
//sneighlist is not referenced here; it is kept for a uniform update-callback signature.
void Cuda_Pair_UpdateNmax_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
  CUT_CHECK_ERROR("Cuda_Pair_UpdateNmax_AllStyles: Begin");

  //System sizes
  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));

  //Atom arrays: device pointers to positions, packed position+type, forces,
  //types, charges, tags and the per-atom energy/virial accumulators
  cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*));
  cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*));


  //Other: debug scratch area
  cudaMemcpyToSymbol(MY_AP(debugdata) , & sdata->debugdata , sizeof(int*));
  CUT_CHECK_ERROR("Cuda_Pair_UpdateNmax_AllStyles: End");
}
258
259 //Initialisation of GPU Constants which rarely change
/* Initialisation of GPU constants which rarely change.
   Converts host-side (double precision) cutoffs and pair coefficients to the
   device precision types and uploads them either to constant memory (default)
   or to global memory bound to texture references (use_global_params, needed
   when the type matrices do not fit into constant space).

     sdata:             shared host/device state
     ncoeff:            number of per-type-pair coefficient matrices to upload (0..9)
     need_q:            also upload coulomb cutoffs and electrostatic constants
     use_global_params: coefficients go to global memory + textures instead of
                        constant memory (mandatory if cuda_ntypes2 > CUDA_MAX_TYPES2)
     need_innercut:     also upload inner cutoffs (smoothed styles)
     need_cut:          upload the outer pair cutoffs                              */
void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q = false, bool use_global_params = false, bool need_innercut = false, bool need_cut = true)
{
  //type index 0 is unused in LAMMPS, hence the matrices are (ntypes+1)^2
  unsigned cuda_ntypes = sdata->atom.ntypes + 1;
  unsigned cuda_ntypes2 = cuda_ntypes * cuda_ntypes;
  unsigned n = sizeof(F_FLOAT) * cuda_ntypes2;   //bytes per coefficient matrix
  unsigned nx = sizeof(X_FLOAT) * cuda_ntypes2;  //bytes per cutoff matrix

  //check if enough constant memory is available
  if((cuda_ntypes2 > CUDA_MAX_TYPES2) && !use_global_params) {
    printf("# CUDA: Cuda_Pair_Init: you need %u types. this is more than %u "
           "(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=32 "
           "or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES_PLUS_ONE - 1);
    //was exit(0): a fatal configuration error must report failure to the environment
    exit(1);
  }

  //type conversion of cutoffs and parameters
  if(need_cut) {
    X_FLOAT cutsq[cuda_ntypes2];

    //start from the global cutoff for every pair of types
    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
        cutsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_global * sdata->pair.cut_global);
      }
    }

    int cutsqdiffer = 0;
    X_FLOAT cutsq_global;
    cutsq_global = (X_FLOAT)(sdata->pair.cut_global * sdata->pair.cut_global);

    //override with per-type-pair cutoffs where the style provides them;
    //track whether any pair deviates from the global value
    if(sdata->pair.cut) {
      for(int i = 1; i <= sdata->atom.ntypes; ++i) {
        for(int j = i; j <= sdata->atom.ntypes; ++j) {
          if(sdata->pair.cut[i][j] > 1e-6) {
            cutsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut[i][j] * sdata->pair.cut[i][j]);
            cutsq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cut[i][j] * sdata->pair.cut[i][j]);
          }

          if(i == 1 && j == 1) cutsq_global = cutsq[i * cuda_ntypes + j];

          if((cutsq_global - cutsq[i * cuda_ntypes + j]) * (cutsq_global - cutsq[i * cuda_ntypes + j]) > 1e-6)
            cutsqdiffer++;
        }
      }
    }

    //styles that expose squared cutoffs directly take precedence
    if(sdata->pair.cutsq) {
      for(int i = 1; i <= sdata->atom.ntypes; ++i) {
        for(int j = i; j <= sdata->atom.ntypes; ++j) {
          //BUGFIX: gate on cutsq itself — the original tested sdata->pair.cut[i][j]
          //here, which inspects the wrong matrix and dereferences NULL when a
          //style provides cutsq but not cut
          if(sdata->pair.cutsq[i][j] > 1e-6) {
            cutsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cutsq[i][j]);
            cutsq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cutsq[i][j]);
          }

          if(i == 1 && j == 1) cutsq_global = cutsq[i * cuda_ntypes + j];

          if((cutsq_global - cutsq[i * cuda_ntypes + j]) * (cutsq_global - cutsq[i * cuda_ntypes + j]) > 1e-6)
            cutsqdiffer++;
        }
      }
    }

    //only upload the full matrix if the cutoffs are type-dependent; a negative
    //cutsq_global tells the kernels to read the per-type matrix instead
    if(cutsqdiffer) {

      cutsq_global = -1.0;
      cudaMemcpyToSymbol(MY_AP(cutsq) , cutsq , nx);
    }

    cudaMemcpyToSymbol(MY_AP(cutsq_global) , &cutsq_global , sizeof(X_FLOAT));
  }

  //inner cutoffs: same scheme as above (global default, per-type override,
  //sentinel -1.0 in the _global symbol when type-dependent)
  if(need_innercut) {
    X_FLOAT cut_innersq[cuda_ntypes2];

    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
        cut_innersq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_inner_global * sdata->pair.cut_inner_global);
      }
    }

    int cutsqdiffer = 0;
    X_FLOAT cut_innersq_global;
    cut_innersq_global = (X_FLOAT)(sdata->pair.cut_inner_global * sdata->pair.cut_inner_global);

    if(sdata->pair.cut_inner) {
      for(int i = 1; i <= sdata->atom.ntypes; ++i) {
        for(int j = i; j <= sdata->atom.ntypes; ++j) {
          if(sdata->pair.cut_inner[i][j] > 1e-6) {
            cut_innersq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]);
            cut_innersq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]);
          }

          if(i == 1 && j == 1) cut_innersq_global = cut_innersq[i * cuda_ntypes + j];

          if((cut_innersq_global - cut_innersq[i * cuda_ntypes + j]) * (cut_innersq_global - cut_innersq[i * cuda_ntypes + j]) > 1e-6)
            cutsqdiffer++;
        }
      }
    }

    if(cutsqdiffer) {
      cut_innersq_global = -1.0;
      cudaMemcpyToSymbol(MY_AP(cut_innersq) , cut_innersq , nx);
    }

    cudaMemcpyToSymbol(MY_AP(cut_innersq_global) , &cut_innersq_global , sizeof(X_FLOAT));
  }

  //coulomb cutoffs: same scheme again
  if(need_q) {
    X_FLOAT cut_coulsq[cuda_ntypes2];

    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
        cut_coulsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_coul_global * sdata->pair.cut_coul_global);
      }
    }

    int cutsqdiffer = 0;
    X_FLOAT cut_coulsq_global;
    cut_coulsq_global = (X_FLOAT)(sdata->pair.cut_coul_global * sdata->pair.cut_coul_global);

    //some styles store a ready-made squared global coulomb cutoff; use the larger
    if(sdata->pair.cut_coulsq_global > cut_coulsq_global) cut_coulsq_global = (X_FLOAT) sdata->pair.cut_coulsq_global;

    if(sdata->pair.cut_coul) {
      for(int i = 1; i <= sdata->atom.ntypes; ++i) {
        for(int j = i; j <= sdata->atom.ntypes; ++j) {
          if(sdata->pair.cut_coul[i][j] > 1e-6) {
            cut_coulsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_coul[i][j] * sdata->pair.cut_coul[i][j]);
            cut_coulsq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cut_coul[i][j] * sdata->pair.cut_coul[i][j]);
          }

          if(i == 1 && j == 1) cut_coulsq_global = cut_coulsq[i * cuda_ntypes + j];

          if((cut_coulsq_global - cut_coulsq[i * cuda_ntypes + j]) * (cut_coulsq_global - cut_coulsq[i * cuda_ntypes + j]) > 1e-6)
            cutsqdiffer++;
        }
      }
    }

    if(cutsqdiffer) {
      cut_coulsq_global = -1.0;
      cudaMemcpyToSymbol(MY_AP(cut_coulsq) , cut_coulsq , nx);
    }

    cudaMemcpyToSymbol(MY_AP(cut_coulsq_global), &cut_coulsq_global , sizeof(X_FLOAT));
  }

  CUT_CHECK_ERROR("Cuda_Pair: init pre Coeff failed");

  //coefficient matrix 1: converted to device precision, then either copied to
  //global memory and bound to a texture (use_global_params) or uploaded to the
  //constant-memory array. Matrices 2..9 below repeat this pattern; textures
  //cannot be passed as parameters in this CUDA generation, hence the repetition.
  if(ncoeff > 0) {
    F_FLOAT coeff1[cuda_ntypes2];

    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
        coeff1[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff1[i][j];
      }
    }

    if(use_global_params) {
      cudaMemcpyToSymbol(MY_AP(coeff1_gm) , &sdata->pair.coeff1_gm.dev_data , sizeof(F_FLOAT*));
      cudaMemcpy((sdata->pair.coeff1_gm.dev_data), coeff1, n, cudaMemcpyHostToDevice);

      _coeff1_gm_tex.normalized = false; // access with unnormalized texture coordinates
      _coeff1_gm_tex.filterMode = cudaFilterModePoint; // no interpolation
      _coeff1_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
      const textureReference* coeff1_gm_texture_ptr = &MY_AP(coeff1_gm_tex);
      CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 a failed");

      //double precision is fetched as int2 pairs and reassembled in the kernels
      //NOTE(review): the texture is bound over sdata->atom.nmax elements although
      //only n bytes of coefficients were copied — presumably nmax*sizeof(F_FLOAT) >= n;
      //confirm against the allocation of coeff1_gm
#if F_PRECISION == 1
      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
      CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 b failed");
      cudaBindTexture(0, coeff1_gm_texture_ptr, sdata->pair.coeff1_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
      CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 c failed");
#else
      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
      CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 b-d failed");
      cudaBindTexture(0, coeff1_gm_texture_ptr, sdata->pair.coeff1_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
      CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 c-d failed");
#endif

    } else
      cudaMemcpyToSymbol(MY_AP(coeff1), coeff1 , n);
  }

  CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 failed");

  if(ncoeff > 1) {
    F_FLOAT coeff2[cuda_ntypes2];

    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
        coeff2[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff2[i][j];
      }
    }

    if(use_global_params) {
      cudaMemcpyToSymbol(MY_AP(coeff2_gm) , &sdata->pair.coeff2_gm.dev_data , sizeof(F_FLOAT*));
      cudaMemcpy(sdata->pair.coeff2_gm.dev_data, coeff2, n, cudaMemcpyHostToDevice);

      _coeff2_gm_tex.normalized = false; // access with unnormalized texture coordinates
      _coeff2_gm_tex.filterMode = cudaFilterModePoint; // no interpolation
      _coeff2_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
      const textureReference* coeff2_gm_texture_ptr = &MY_AP(coeff2_gm_tex);

#if F_PRECISION == 1
      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
      cudaBindTexture(0, coeff2_gm_texture_ptr, sdata->pair.coeff2_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
#else
      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
      cudaBindTexture(0, coeff2_gm_texture_ptr, sdata->pair.coeff2_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
#endif

    } else
      cudaMemcpyToSymbol(MY_AP(coeff2), coeff2 , n);
  }

  CUT_CHECK_ERROR("Cuda_Pair: init Coeff1 failed");

  if(ncoeff > 2) {
    F_FLOAT coeff3[cuda_ntypes2];

    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
        coeff3[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff3[i][j];
      }
    }

    if(use_global_params) {
      cudaMemcpyToSymbol(MY_AP(coeff3_gm) , &sdata->pair.coeff3_gm.dev_data , sizeof(F_FLOAT*));
      cudaMemcpy(sdata->pair.coeff3_gm.dev_data, coeff3, n, cudaMemcpyHostToDevice);
      _coeff3_gm_tex.normalized = false; // access with unnormalized texture coordinates
      _coeff3_gm_tex.filterMode = cudaFilterModePoint; // no interpolation
      _coeff3_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
      const textureReference* coeff3_gm_texture_ptr = &MY_AP(coeff3_gm_tex);

#if F_PRECISION == 1
      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
      cudaBindTexture(0, coeff3_gm_texture_ptr, sdata->pair.coeff3_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
#else
      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
      cudaBindTexture(0, coeff3_gm_texture_ptr, sdata->pair.coeff3_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
#endif
    } else
      cudaMemcpyToSymbol(MY_AP(coeff3), coeff3 , n);
  }

  CUT_CHECK_ERROR("Cuda_Pair: init Coeff3 failed");

  if(ncoeff > 3) {
    F_FLOAT coeff4[cuda_ntypes2];

    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
        coeff4[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff4[i][j];
      }
    }

    if(use_global_params) {
      cudaMemcpyToSymbol(MY_AP(coeff4_gm) , &sdata->pair.coeff4_gm.dev_data , sizeof(F_FLOAT*));
      cudaMemcpy(sdata->pair.coeff4_gm.dev_data, coeff4, n, cudaMemcpyHostToDevice);
      _coeff4_gm_tex.normalized = false; // access with unnormalized texture coordinates
      _coeff4_gm_tex.filterMode = cudaFilterModePoint; // no interpolation
      _coeff4_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
      const textureReference* coeff4_gm_texture_ptr = &MY_AP(coeff4_gm_tex);

#if F_PRECISION == 1
      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
      cudaBindTexture(0, coeff4_gm_texture_ptr, sdata->pair.coeff4_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
#else
      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
      cudaBindTexture(0, coeff4_gm_texture_ptr, sdata->pair.coeff4_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
#endif
    } else
      cudaMemcpyToSymbol(MY_AP(coeff4), coeff4 , n);
  }

  CUT_CHECK_ERROR("Cuda_Pair: init Coeff4 failed");

  if(ncoeff > 4) {
    F_FLOAT coeff5[cuda_ntypes2];

    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
        coeff5[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff5[i][j];
      }
    }

    if(use_global_params) {
      cudaMemcpyToSymbol(MY_AP(coeff5_gm) , &sdata->pair.coeff5_gm.dev_data , sizeof(F_FLOAT*));
      cudaMemcpy(sdata->pair.coeff5_gm.dev_data, coeff5, n, cudaMemcpyHostToDevice);
      _coeff5_gm_tex.normalized = false; // access with unnormalized texture coordinates
      _coeff5_gm_tex.filterMode = cudaFilterModePoint; // no interpolation
      _coeff5_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
      const textureReference* coeff5_gm_texture_ptr = &MY_AP(coeff5_gm_tex);

#if F_PRECISION == 1
      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
      cudaBindTexture(0, coeff5_gm_texture_ptr, sdata->pair.coeff5_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
#else
      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
      cudaBindTexture(0, coeff5_gm_texture_ptr, sdata->pair.coeff5_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
#endif
    } else
      cudaMemcpyToSymbol(MY_AP(coeff5), coeff5 , n);
  }

  CUT_CHECK_ERROR("Cuda_Pair: init Coeff5 failed");

  //matrices 6..9 exist only as global-memory/texture variants: there are no
  //constant-memory arrays for them, so no else-branch upload is possible
  if(ncoeff > 5) {
    F_FLOAT coeff6[cuda_ntypes2];

    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
        coeff6[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff6[i][j];
      }
    }

    if(use_global_params) {
      cudaMemcpyToSymbol(MY_AP(coeff6_gm) , &sdata->pair.coeff6_gm.dev_data , sizeof(F_FLOAT*));
      cudaMemcpy(sdata->pair.coeff6_gm.dev_data, coeff6, n, cudaMemcpyHostToDevice);
      _coeff6_gm_tex.normalized = false; // access with unnormalized texture coordinates
      _coeff6_gm_tex.filterMode = cudaFilterModePoint; // no interpolation
      _coeff6_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
      const textureReference* coeff6_gm_texture_ptr = &MY_AP(coeff6_gm_tex);

#if F_PRECISION == 1
      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
      cudaBindTexture(0, coeff6_gm_texture_ptr, sdata->pair.coeff6_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
#else
      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
      cudaBindTexture(0, coeff6_gm_texture_ptr, sdata->pair.coeff6_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
#endif
    }
  }

  CUT_CHECK_ERROR("Cuda_Pair: init Coeff6 failed");

  if(ncoeff > 6) {
    F_FLOAT coeff7[cuda_ntypes2];

    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
        coeff7[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff7[i][j];
      }
    }

    if(use_global_params) {
      cudaMemcpyToSymbol(MY_AP(coeff7_gm) , &sdata->pair.coeff7_gm.dev_data , sizeof(F_FLOAT*));
      cudaMemcpy(sdata->pair.coeff7_gm.dev_data, coeff7, n, cudaMemcpyHostToDevice);
      _coeff7_gm_tex.normalized = false; // access with unnormalized texture coordinates
      _coeff7_gm_tex.filterMode = cudaFilterModePoint; // no interpolation
      _coeff7_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
      const textureReference* coeff7_gm_texture_ptr = &MY_AP(coeff7_gm_tex);

#if F_PRECISION == 1
      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
      cudaBindTexture(0, coeff7_gm_texture_ptr, sdata->pair.coeff7_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
#else
      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
      cudaBindTexture(0, coeff7_gm_texture_ptr, sdata->pair.coeff7_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
#endif
    }
  }

  CUT_CHECK_ERROR("Cuda_Pair: init Coeff7 failed");

  if(ncoeff > 7) {
    F_FLOAT coeff8[cuda_ntypes2];

    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
        coeff8[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff8[i][j];
      }
    }

    if(use_global_params) {
      cudaMemcpyToSymbol(MY_AP(coeff8_gm) , &sdata->pair.coeff8_gm.dev_data , sizeof(F_FLOAT*));
      cudaMemcpy(sdata->pair.coeff8_gm.dev_data, coeff8, n, cudaMemcpyHostToDevice);
      _coeff8_gm_tex.normalized = false; // access with unnormalized texture coordinates
      _coeff8_gm_tex.filterMode = cudaFilterModePoint; // no interpolation
      _coeff8_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
      const textureReference* coeff8_gm_texture_ptr = &MY_AP(coeff8_gm_tex);

#if F_PRECISION == 1
      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
      cudaBindTexture(0, coeff8_gm_texture_ptr, sdata->pair.coeff8_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
#else
      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
      cudaBindTexture(0, coeff8_gm_texture_ptr, sdata->pair.coeff8_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
#endif
    }
  }

  CUT_CHECK_ERROR("Cuda_Pair: init Coeff8 failed");

  if(ncoeff > 8) {
    F_FLOAT coeff9[cuda_ntypes2];

    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
        coeff9[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff9[i][j];
      }
    }

    if(use_global_params) {
      cudaMemcpyToSymbol(MY_AP(coeff9_gm) , &sdata->pair.coeff9_gm.dev_data , sizeof(F_FLOAT*));
      cudaMemcpy(sdata->pair.coeff9_gm.dev_data, coeff9, n, cudaMemcpyHostToDevice);
      _coeff9_gm_tex.normalized = false; // access with unnormalized texture coordinates
      _coeff9_gm_tex.filterMode = cudaFilterModePoint; // no interpolation
      _coeff9_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
      const textureReference* coeff9_gm_texture_ptr = &MY_AP(coeff9_gm_tex);

#if F_PRECISION == 1
      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
      cudaBindTexture(0, coeff9_gm_texture_ptr, sdata->pair.coeff9_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
#else
      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
      cudaBindTexture(0, coeff9_gm_texture_ptr, sdata->pair.coeff9_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
#endif
    }
  }

  CUT_CHECK_ERROR("Cuda_Pair: init Coeff9 failed");

  //special-bonds scaling factors for LJ interactions, converted to device precision
  F_FLOAT special_lj[4];
  special_lj[0] = sdata->pair.special_lj[0];
  special_lj[1] = sdata->pair.special_lj[1];
  special_lj[2] = sdata->pair.special_lj[2];
  special_lj[3] = sdata->pair.special_lj[3];


  //extent of this MPI rank's subdomain
  X_FLOAT box_size[3] = {
    sdata->domain.subhi[0] - sdata->domain.sublo[0],
    sdata->domain.subhi[1] - sdata->domain.sublo[1],
    sdata->domain.subhi[2] - sdata->domain.sublo[2]
  };

  cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3);
  cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , &cuda_ntypes , sizeof(unsigned));
  cudaMemcpyToSymbol(MY_AP(special_lj) , special_lj , sizeof(F_FLOAT) * 4);
  cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(periodicity) , sdata->domain.periodicity , sizeof(int) * 3);
  cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int));

  //electrostatics constants, only meaningful for charged pair styles
  if(need_q) {
    F_FLOAT qqrd2e_tmp = sdata->pppm.qqrd2e;
    F_FLOAT special_coul[4];
    special_coul[0] = sdata->pair.special_coul[0];
    special_coul[1] = sdata->pair.special_coul[1];
    special_coul[2] = sdata->pair.special_coul[2];
    special_coul[3] = sdata->pair.special_coul[3];

    cudaMemcpyToSymbol(MY_AP(special_coul) , special_coul , sizeof(F_FLOAT) * 4);
    cudaMemcpyToSymbol(MY_AP(g_ewald) , &sdata->pair.g_ewald , sizeof(F_FLOAT));
    cudaMemcpyToSymbol(MY_AP(qqrd2e) , &qqrd2e_tmp , sizeof(F_FLOAT));
    cudaMemcpyToSymbol(MY_AP(kappa) , &sdata->pair.kappa , sizeof(F_FLOAT));
    cudaMemcpyToSymbol(MY_AP(eng_coul) , &sdata->pair.eng_coul.dev_data , sizeof(ENERGY_FLOAT*));
  }

  CUT_CHECK_ERROR("Cuda_Pair: init failed");
}
my_times startpairtime, endpairtime; //wall-clock stamps bracketing the pair kernel; their difference is accumulated into sdata->cuda_timings.pair_kernel
724 //Function which is called prior to kernel invocation, determins grid, Binds Textures, updates constant memory if necessary
//Common pre-launch setup shared by all pair styles: refreshes device constants
//that changed since the last call, binds the position(+charge) textures, picks
//the launch configuration and sizes the reduction buffer.
//  grid, threads, sharedperproc are output parameters consumed by the caller's
//  kernel launch; sharedperproc counts ENERGY_FLOAT reduction slots per thread.
void Cuda_Pair_PreKernel_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, dim3 &grid, dim3 &threads, int &sharedperproc, bool need_q = false, int maxthreads = 256)
{
  //nothing to launch on an empty subdomain
  if(sdata->atom.nlocal == 0) return;

  if(sdata->atom.update_neigh)
    Cuda_Pair_UpdateNeighbor_AllStyles(sdata, sneighlist);

  if(sdata->atom.update_nmax)
    Cuda_Pair_UpdateNmax_AllStyles(sdata, sneighlist);

  if(sdata->atom.update_nlocal) {
    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
    cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
  }

  BindXTypeTexture(sdata);

  if(need_q) BindQTexture(sdata);

  //shared-memory reduction slots per thread: 3 force components when one block
  //handles one atom, 1 for vdW energy, 1 for coulomb energy, 6 virial terms
  sharedperproc = (sdata->pair.use_block_per_atom ? 3 : 0)
                + (eflag ? 1 : 0)
                + ((need_q && eflag) ? 1 : 0)
                + (vflag ? 6 : 0);

  //during overlapped communication phase 2 only border atoms are computed
  int threadnum = (sdata->comm.comm_phase == 2) ? sneighlist->inum_border2
                                                : sneighlist->inum;

  if(sdata->pair.use_block_per_atom) {
    maxthreads = 64;
    threadnum *= 64;
  }

  //need to limit to 192 threads due to register limit
  int3 layout = getgrid(threadnum, sharedperproc * sizeof(ENERGY_FLOAT), maxthreads, true);
  threads = dim3(layout.z, 1, 1);
  grid = dim3(layout.x, layout.y, 1);

  //one reduction slot set per block, plus room for deferred forces if requested
  int size = (unsigned)(layout.y * layout.x) * sharedperproc * sizeof(ENERGY_FLOAT);

  if(sdata->pair.collect_forces_later)
    size += (unsigned)(sdata->atom.nmax * 3 * sizeof(F_FLOAT));

  Cuda_UpdateBuffer(sdata, size);

  //block-per-atom kernels accumulate into the buffer, so it must start zeroed
  if(sdata->pair.use_block_per_atom)
    cudaMemset(sdata->buffer, 0, size);

  sdata->pair.lastgridsize = grid.x * grid.y;
  //the 3 force slots are not part of the energy/virial reduction
  sdata->pair.n_energy_virial = sdata->pair.use_block_per_atom ? sharedperproc - 3
                                                               : sharedperproc;

  my_gettime(CLOCK_REALTIME, &startpairtime);

  MYDBG(printf("# CUDA: Cuda_Pair: kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_FLOAT)*threads.x);)
}
792
//Called after the pair kernel invocation: closes out kernel timing and reduces
//the per-block energy/virial partial sums into scalars.
//NOTE: grid is modified in place (repurposed for the reduction launch);
//sharedperproc is read only.  When collect_forces_later is set, this work is
//done by Cuda_Pair_CollectForces instead and this function is a no-op.
void Cuda_Pair_PostKernel_AllStyles(cuda_shared_data* sdata, dim3 &grid, int &sharedperproc, int eflag, int vflag)
{
  if((not sdata->pair.collect_forces_later) && (eflag || vflag)) { //not sdata->comm.comm_phase==2))
    cudaThreadSynchronize();
    my_gettime(CLOCK_REALTIME, &endpairtime);
    sdata->cuda_timings.pair_kernel +=
      endpairtime.tv_sec - startpairtime.tv_sec + 1.0 * (endpairtime.tv_nsec - startpairtime.tv_nsec) / 1000000000;
    CUT_CHECK_ERROR("Cuda_Pair: Kernel execution failed");

    // The former inner "if(eflag || vflag)" was removed: the enclosing
    // condition already guarantees it, so the branch was always taken.

    int n = grid.x * grid.y;   // number of per-block partials per quantity

    // One reduction block per energy/virial quantity; block-per-atom mode
    // carries 3 extra shared force slots that are not reduced here.
    if(sdata->pair.use_block_per_atom)
      grid.x = sharedperproc - 3;
    else
      grid.x = sharedperproc;

    grid.y = 1;
    dim3 threads(128, 1, 1);
    MYDBG(printf("# CUDA: Cuda_Pair: virial compute kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_FLOAT)*threads.x);)
    MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(n);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_Pair: virial compute Kernel execution failed");

    MYDBG(printf("# CUDA: Cuda_Pair: kernel done\n");)
  }
}
822
823
824 #include "pair_born_coul_long_cuda.cu"
825 #include "pair_buck_coul_cut_cuda.cu"
826 #include "pair_buck_coul_long_cuda.cu"
827 #include "pair_buck_cuda.cu"
828 #include "pair_lj_sdk_cuda.cu"
829 #include "pair_lj_sdk_coul_cut_cuda.cu"
830 #include "pair_lj_sdk_coul_debye_cuda.cu"
831 #include "pair_lj_sdk_coul_long_cuda.cu"
832 #include "pair_gran_hooke_cuda.cu"
833 #include "pair_lj_charmm_coul_charmm_implicit_cuda.cu"
834 #include "pair_lj_charmm_coul_charmm_cuda.cu"
835 #include "pair_lj_charmm_coul_long_cuda.cu"
836 #include "pair_lj_class2_coul_cut_cuda.cu"
837 #include "pair_lj_class2_coul_long_cuda.cu"
838 #include "pair_lj_class2_cuda.cu"
839 #include "pair_lj_cut_coul_cut_cuda.cu"
840 #include "pair_lj_cut_coul_debye_cuda.cu"
841 #include "pair_lj_cut_coul_long_cuda.cu"
842 #include "pair_lj_cut_cuda.cu"
843 #include "pair_lj_cut_experimental_cuda.cu"
844 #include "pair_lj_expand_cuda.cu"
845 #include "pair_lj_gromacs_cuda.cu"
846 #include "pair_lj_gromacs_coul_gromacs_cuda.cu"
847 #include "pair_lj_smooth_cuda.cu"
848 #include "pair_lj96_cut_cuda.cu"
849 #include "pair_morse_coul_long_cuda.cu"
850 #include "pair_morse_cuda.cu"
851 #include "pair_eam_cuda.cu"
852
853 #include "cuda_pair_kernel.cu"
854
855 #include "pair_manybody_const.h"
856 #include "pair_tersoff_cuda.cu"
857 #include "pair_sw_cuda.cu"
858
//Re-uploads every constant-memory symbol that depends on the per-atom array
//sizes or on the (possibly reallocated) device array pointers.  Must be called
//whenever sdata->atom.update_nmax is set before launching a pair kernel.
void Cuda_Pair_UpdateNmax(cuda_shared_data* sdata)
{
  CUT_CHECK_ERROR("Cuda_Pair: before updateNmax failed");
  // atom counts and capacity
  cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nall), &sdata->atom.nall, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax), &sdata->atom.nmax, sizeof(int));
  // per-atom device array base pointers
  cudaMemcpyToSymbol(MY_AP(type), &sdata->atom.type.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(x), &sdata->atom.x.dev_data, sizeof(X_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(x_type), &sdata->atom.x_type.dev_data, sizeof(X_FLOAT4*));
  cudaMemcpyToSymbol(MY_AP(xhold), &sdata->atom.xhold.dev_data, sizeof(X_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(v), &sdata->atom.v.dev_data, sizeof(V_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(radius), &sdata->atom.radius.dev_data, sizeof(X_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(v_radius), &sdata->atom.v_radius.dev_data, sizeof(V_FLOAT4*));
  cudaMemcpyToSymbol(MY_AP(omega), &sdata->atom.omega.dev_data, sizeof(V_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(rmass), &sdata->atom.rmass.dev_data, sizeof(V_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(omega_rmass), &sdata->atom.omega_rmass.dev_data, sizeof(V_FLOAT4*));
  cudaMemcpyToSymbol(MY_AP(map_array), &sdata->atom.map_array.dev_data, sizeof(int*));
  CUT_CHECK_ERROR("Cuda_Pair: updateNmax failed");
}
878
879
//Host driver for Pair_GenerateXType_Kernel, launched with one thread per atom
//(local + ghost).  Presumably packs x and type into the combined x_type array
//used by the pair kernels — kernel body is defined in cuda_pair_kernel.cu.
void Cuda_Pair_GenerateXType(cuda_shared_data* sdata)
{
  MYDBG(printf(" # CUDA: GenerateXType ... start %i %i %i %p %p %p %p\n", sdata->atom.nlocal, sdata->atom.nall, sdata->atom.nmax, sdata->atom.x.dev_data, sdata->atom.x_type.dev_data, sdata->atom.xhold.dev_data, sdata->atom.type.dev_data);)

  // Bring device-side sizes and pointers up to date first.
  if(sdata->atom.update_nmax) Cuda_Pair_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal) {
    cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal, sizeof(int));
    cudaMemcpyToSymbol(MY_AP(nall), &sdata->atom.nall, sizeof(int));
  }

  MYDBG(printf(" # CUDA: GenerateXType ... getgrid\n"); fflush(stdout);)

  // One thread per atom over all nall atoms.
  int3 cfg = getgrid(sdata->atom.nall);
  dim3 threads(cfg.z, 1, 1);
  dim3 grid(cfg.x, cfg.y, 1);

  MYDBG(printf(" # CUDA: GenerateXType ... kernel start test\n"); fflush(stdout);)
  Pair_GenerateXType_Kernel <<< grid, threads, 0>>>();
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_Pair GenerateXType: Kernel failed");
  MYDBG(printf(" # CUDA: GenerateXType ... end\n"); fflush(stdout);)
}
904
//Host driver for Pair_RevertXType_Kernel, launched with one thread per atom
//(local + ghost).  Counterpart of Cuda_Pair_GenerateXType — presumably restores
//x/type from the packed x_type array; kernel body is defined elsewhere.
void Cuda_Pair_RevertXType(cuda_shared_data* sdata)
{
  MYDBG(printf(" # CUDA: RevertXType ... start\n");)

  // Refresh device-side pointers if per-atom arrays were reallocated.
  if(sdata->atom.update_nmax)
    Cuda_Pair_UpdateNmax(sdata);

  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));

  // One thread per atom over all nall atoms.
  int3 layout = getgrid(sdata->atom.nall);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  Pair_RevertXType_Kernel <<< grid, threads, 0>>>();
  cudaThreadSynchronize();
  // BUGFIX: error message previously said "GenerateXType" (copy-paste error),
  // which misattributed failures of this kernel in diagnostics.
  CUT_CHECK_ERROR("Cuda_Pair RevertXType: Kernel failed");
  MYDBG(printf(" # CUDA: RevertXType ... end\n");)
}
924
//Host driver for Pair_GenerateVRadius_Kernel, launched with one thread per
//atom (local + ghost).  Presumably packs v and radius into the combined
//v_radius array — kernel body is defined in cuda_pair_kernel.cu.
void Cuda_Pair_GenerateVRadius(cuda_shared_data* sdata)
{
  MYDBG(printf(" # CUDA: GenerateVRadius ... start %i %i %i %p %p %p %p\n", sdata->atom.nlocal, sdata->atom.nall, sdata->atom.nmax, sdata->atom.x.dev_data, sdata->atom.x_type.dev_data, sdata->atom.xhold.dev_data, sdata->atom.type.dev_data);)

  // Bring device-side sizes and pointers up to date first.
  if(sdata->atom.update_nmax) Cuda_Pair_UpdateNmax(sdata);

  cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nall), &sdata->atom.nall, sizeof(int));
  MYDBG(printf(" # CUDA: GenerateVRadius ... getgrid\n"); fflush(stdout);)

  // One thread per atom over all nall atoms.
  int3 cfg = getgrid(sdata->atom.nall);
  dim3 threads(cfg.z, 1, 1);
  dim3 grid(cfg.x, cfg.y, 1);

  MYDBG(printf(" # CUDA: GenerateVRadius ... kernel start test\n"); fflush(stdout);)
  Pair_GenerateVRadius_Kernel <<< grid, threads, 0>>>();
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_Pair GenerateVRadius: Kernel failed");
  MYDBG(printf(" # CUDA: GenerateVRadius ... end\n"); fflush(stdout);)
}
946
//Host driver for Pair_GenerateOmegaRmass_Kernel, launched with one thread per
//atom (local + ghost).  Presumably packs omega and rmass into the combined
//omega_rmass array — kernel body is defined in cuda_pair_kernel.cu.
void Cuda_Pair_GenerateOmegaRmass(cuda_shared_data* sdata)
{
  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... start %i %i %i %p %p %p %p\n", sdata->atom.nlocal, sdata->atom.nall, sdata->atom.nmax, sdata->atom.x.dev_data, sdata->atom.x_type.dev_data, sdata->atom.xhold.dev_data, sdata->atom.type.dev_data);)

  // Bring device-side sizes and pointers up to date first.
  if(sdata->atom.update_nmax) Cuda_Pair_UpdateNmax(sdata);

  cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nall), &sdata->atom.nall, sizeof(int));
  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... getgrid\n"); fflush(stdout);)

  // One thread per atom over all nall atoms.
  int3 cfg = getgrid(sdata->atom.nall);
  dim3 threads(cfg.z, 1, 1);
  dim3 grid(cfg.x, cfg.y, 1);

  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... kernel start test\n"); fflush(stdout);)
  Pair_GenerateOmegaRmass_Kernel <<< grid, threads, 0>>>();
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_Pair GenerateOmegaRmass: Kernel failed");
  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... end\n"); fflush(stdout);)
}
968
//Host driver for Pair_BuildXHold_Kernel, launched with one thread per atom
//(local + ghost).  Presumably snapshots current positions into xhold (used for
//neighbor-list rebuild triggering) — kernel body is defined elsewhere.
void Cuda_Pair_BuildXHold(cuda_shared_data* sdata)
{
  // Refresh device-side pointers if per-atom arrays were reallocated.
  if(sdata->atom.update_nmax)
    Cuda_Pair_UpdateNmax(sdata);

  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));

  // One thread per atom over all nall atoms.
  int3 layout = getgrid(sdata->atom.nall);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  Pair_BuildXHold_Kernel <<< grid, threads, 0>>>();
  cudaThreadSynchronize();
  // BUGFIX: error message previously said "GenerateXType" (copy-paste error),
  // which misattributed failures of this kernel in diagnostics.
  CUT_CHECK_ERROR("Cuda_Pair BuildXHold: Kernel failed");
}
985
//Deferred post-kernel path (collect_forces_later mode): closes out pair-kernel
//timing, reduces the buffered per-block energy/virial partials, then launches
//Pair_CollectForces_Kernel to fold the buffered forces into the force arrays.
void Cuda_Pair_CollectForces(cuda_shared_data* sdata, int eflag, int vflag)
{
  cudaThreadSynchronize();
  my_gettime(CLOCK_REALTIME, &endpairtime);
  sdata->cuda_timings.pair_kernel +=
    endpairtime.tv_sec - startpairtime.tv_sec + 1.0 * (endpairtime.tv_nsec - startpairtime.tv_nsec) / 1000000000;
  CUT_CHECK_ERROR("Cuda_Pair: Kernel execution failed");

  dim3 threads;
  dim3 grid;

  if(eflag || vflag) {
    // One 128-thread reduction block per stored energy/virial quantity;
    // the geometry of the producing launch was saved in lastgridsize.
    int n = sdata->pair.lastgridsize;
    grid = dim3(sdata->pair.n_energy_virial, 1, 1);
    threads = dim3(128, 1, 1);
    MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(n);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_Pair_CollectForces: virial compute Kernel execution failed");
  }

  // Force summation: one thread per local atom.
  int3 cfg = getgrid(sdata->atom.nlocal);
  threads = dim3(cfg.z, 1, 1);
  grid = dim3(cfg.x, cfg.y, 1);
  Pair_CollectForces_Kernel <<< grid, threads, 0>>>(sdata->pair.n_energy_virial, sdata->pair.lastgridsize);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_Pair_CollectForces: Force Summation Kernel execution failed");
}
1016