/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

#ifndef _CUDA_SHARED_H_
#define _CUDA_SHARED_H_
#include "cuda_precision.h"

#define CUDA_MAX_DEBUG_SIZE 1000 // size of the debugdata array (holds this many doubles, or twice as many ints)

struct dev_array {
  void* dev_data;       // pointer to the memory allocation on the CUDA device
  unsigned dim[3];      // array dimensions
};
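
/* A hedged usage sketch (not part of the original API): a dev_array describes
   a flat device allocation whose logical shape is given by dim[]. Assuming
   the convention that unused trailing extents are left at 0, a host-side
   helper to compute the element count could look like this: */
static inline unsigned dev_array_nelements(const dev_array& a)
{
  unsigned n = a.dim[0] ? a.dim[0] : 1;   // treat an unset extent as 1
  if (a.dim[1]) n *= a.dim[1];
  if (a.dim[2]) n *= a.dim[2];
  return n;
}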

struct cuda_shared_atom {   // relevant data from the atom class
  dev_array dx;             // accumulated displacement since the last binning
  dev_array x;              // position
  dev_array v;              // velocity
  dev_array f;              // force
  dev_array tag;            // global atom ID
  dev_array type;           // atom type; ghost type = ntypes (ntypescuda = ntypes + 1)
  dev_array mask;
  dev_array image;
  dev_array q;              // charges
  dev_array mass;           // per-type masses
  dev_array rmass;          // per-atom masses
  dev_array radius;         // per-atom radius
  dev_array density;
  dev_array omega;          // angular velocity
  dev_array torque;
  dev_array molecule;       // molecule ID

  dev_array special;
  int maxspecial;
  dev_array nspecial;
  int* special_flag;
  int molecular;

  dev_array eatom;          // per-atom energy
  dev_array vatom;          // per-atom virial
  int need_eatom;
  int need_vatom;

  dev_array x_type;         // position + type packed into an X_FLOAT4 struct
  dev_array v_radius;       // velocity + radius packed into a V_FLOAT4 struct; currently only used by the granular atom style
  dev_array omega_rmass;    // omega + rmass packed into a V_FLOAT4 struct; currently only used by the granular atom style

  double* mass_host;        // host pointer to the per-type masses
  //int natoms;             // total # of atoms in system, could be 0
  int nghost;               // # of ghost atoms on this proc
  int nlocal;               // # of owned atoms on this proc
  int nall;                 // total # of owned + ghost atoms on this proc
  int nmax;                 // max # of owned + ghost atoms the per-atom arrays on this proc can hold
  int ntypes;
  int q_flag;               // do we have charges?
  int rmass_flag;           // do we have per-atom masses?
  int firstgroup;
  int nfirst;

  int update_nlocal;
  int update_nmax;
  int update_neigh;

  dev_array xhold;          // positions at the last reneighboring
  X_FLOAT triggerneighsq;   // maximum squared displacement before reneighboring is triggered
  int reneigh_flag;         // is reneighboring necessary?
  int maxhold;              // size of xhold
  int dist_check;           // perform the distance check for reneighboring
  dev_array binned_id;      // id of each binned atom (not the tag!)
  dev_array binned_idnew;   // new id of each binned atom for sorting; effectively moves atom[binned_id[k]] to atom[binned_idnew[k]]
  float bin_extraspace;
  int bin_dim[3];
  int bin_nmax;
  dev_array map_array;
};
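
/* A hedged sketch of how xhold and triggerneighsq are typically used for the
   dist_check test (the real check runs in a device kernel; this host-style
   scalar form, with illustrative names, shows the logic only). In LAMMPS the
   trigger is conventionally (skin/2)^2: */
static inline bool moved_too_far(const X_FLOAT xyz[3], const X_FLOAT hold[3],
                                 X_FLOAT triggerneighsq)
{
  X_FLOAT dx = xyz[0] - hold[0];
  X_FLOAT dy = xyz[1] - hold[1];
  X_FLOAT dz = xyz[2] - hold[2];
  // reneighbor once any atom has moved more than half the skin distance
  return dx * dx + dy * dy + dz * dz > triggerneighsq;
}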

struct cuda_shared_pair {   // relevant data from the pair class
  char cudable_force;       // nonzero if the active pair style can run on the GPU; test as (cudable_force != 0)
  X_FLOAT cut_global;
  X_FLOAT cut_inner_global;
  X_FLOAT cut_coul_global;
  double** cut;             // type-type cutoff
  double** cutsq;           // type-type cutoff squared
  double** cut_inner;       // type-type inner cutoff
  double** cut_coul;        // type-type coulomb cutoff
  double** coeff1;          // type-type pair coefficients (coeff1..coeff10)
  double** coeff2;
  double** coeff3;
  double** coeff4;
  double** coeff5;
  double** coeff6;
  double** coeff7;
  double** coeff8;
  double** coeff9;
  double** coeff10;
  double** offset;
  double* special_lj;
  double* special_coul;
  dev_array virial;    // ENERGY_FLOAT
  dev_array eng_vdwl;  // ENERGY_FLOAT
  dev_array eng_coul;  // ENERGY_FLOAT
  X_FLOAT cut_coulsq_global;
  F_FLOAT g_ewald, kappa;
  int freeze_group_bit;

  dev_array coeff1_gm;
  dev_array coeff2_gm;
  dev_array coeff3_gm;
  dev_array coeff4_gm;
  dev_array coeff5_gm;
  dev_array coeff6_gm;
  dev_array coeff7_gm;
  dev_array coeff8_gm;
  dev_array coeff9_gm;
  dev_array coeff10_gm;

  int lastgridsize;
  int n_energy_virial;
  int collect_forces_later;
  int use_block_per_atom;
  int override_block_per_atom;
  bool neighall;
};
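
/* A hedged sketch of the relationship between the host coeffN tables and the
   coeffN_gm device arrays: the host tables are (ntypes+1) x (ntypes+1)
   matrices (row/column 0 unused, as in LAMMPS), which have to be flattened
   into contiguous storage before upload. The helper name and staging-buffer
   approach are illustrative assumptions: */
static inline void flatten_coeff_table(double** coeff, int ntypes, double* staging)
{
  // row-major (ntypes+1) x (ntypes+1) copy; the device layout/precision may differ
  for (int i = 0; i <= ntypes; i++)
    for (int j = 0; j <= ntypes; j++)
      staging[i * (ntypes + 1) + j] = coeff[i][j];
}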

struct cuda_shared_domain {   // relevant data from the domain class
  X_FLOAT sublo[3];           // sub-box bounds on this proc (orthogonal box)
  X_FLOAT subhi[3];
  X_FLOAT boxlo[3];
  X_FLOAT boxhi[3];
  X_FLOAT prd[3];
  int periodicity[3];         // xyz periodicity as an array

  int triclinic;
  X_FLOAT xy;
  X_FLOAT xz;
  X_FLOAT yz;
  X_FLOAT boxlo_lamda[3];
  X_FLOAT boxhi_lamda[3];
  X_FLOAT prd_lamda[3];
  X_FLOAT h[6];
  X_FLOAT h_inv[6];
  V_FLOAT h_rate[6];
  int update;
};
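
/* A hedged sketch of the triclinic coordinate mapping these fields support:
   lamda coordinates span the unit cube and map to box coordinates via
   x = h * lamda + boxlo, with h stored in LAMMPS' 6-element order
   (h[0]=xprd, h[1]=yprd, h[2]=zprd, h[3]=yz, h[4]=xz, h[5]=xy). The helper
   name is illustrative: */
static inline void lamda_to_x(const cuda_shared_domain& d,
                              const X_FLOAT lamda[3], X_FLOAT x[3])
{
  x[0] = d.h[0] * lamda[0] + d.h[5] * lamda[1] + d.h[4] * lamda[2] + d.boxlo[0];
  x[1] = d.h[1] * lamda[1] + d.h[3] * lamda[2] + d.boxlo[1];
  x[2] = d.h[2] * lamda[2] + d.boxlo[2];
}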

struct cuda_shared_pppm {
  char cudable_force;
#ifdef FFT_CUFFT
  FFT_FLOAT* work1;
  FFT_FLOAT* work2;
  FFT_FLOAT* work3;
  PPPM_FLOAT* greensfn;
  PPPM_FLOAT* fkx;
  PPPM_FLOAT* fky;
  PPPM_FLOAT* fkz;
  PPPM_FLOAT* vg;
#endif
  int* part2grid;
  PPPM_FLOAT* density_brick;
  int* density_brick_int;
  PPPM_FLOAT density_intScale;
  PPPM_FLOAT* vdx_brick;
  PPPM_FLOAT* vdy_brick;
  PPPM_FLOAT* vdz_brick;
  PPPM_FLOAT* density_fft;
  ENERGY_FLOAT* energy;
  ENERGY_FLOAT* virial;
  int nxlo_in;
  int nxhi_in;
  int nxlo_out;
  int nxhi_out;
  int nylo_in;
  int nyhi_in;
  int nylo_out;
  int nyhi_out;
  int nzlo_in;
  int nzhi_in;
  int nzlo_out;
  int nzhi_out;
  int nx_pppm;
  int ny_pppm;
  int nz_pppm;
  PPPM_FLOAT qqrd2e;
  int order;
  // float3 sublo;
  PPPM_FLOAT* rho_coeff;
  int nmax;
  int nlocal;
  PPPM_FLOAT* debugdata;
  PPPM_FLOAT delxinv;
  PPPM_FLOAT delyinv;
  PPPM_FLOAT delzinv;
  int nlower;
  int nupper;
  PPPM_FLOAT shiftone;
  PPPM_FLOAT3* fH;
};
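
/* A hedged sketch of the particle-to-grid mapping that delxinv and shiftone
   support, following the standard PPPM convention (delxinv = nx_pppm / xprd;
   shiftone centers the interpolation stencil). Given the grid index nx already
   assigned to a particle, this returns its fractional offset in grid units,
   as used when spreading charge onto the density brick. The helper name is
   illustrative: */
static inline PPPM_FLOAT grid_offset_x(PPPM_FLOAT x, PPPM_FLOAT boxlo_x,
                                       PPPM_FLOAT delxinv, PPPM_FLOAT shiftone,
                                       int nx)
{
  return nx + shiftone - (x - boxlo_x) * delxinv;
}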

struct cuda_shared_comm {
  int maxswap;
  int maxlistlength;
  dev_array pbc;
  dev_array slablo;
  dev_array slabhi;
  dev_array multilo;
  dev_array multihi;
  dev_array sendlist;
  int grow_flag;
  int comm_phase;

  int nsend;
  int* nsend_swap;
  int* send_size;
  int* recv_size;
  double** buf_send;
  void** buf_send_dev;
  double** buf_recv;
  void** buf_recv_dev;
  void* buffer;
  int buffer_size;
  double overlap_split_ratio;
};
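
/* A hedged sketch of the forward-communication packing that sendlist and the
   send buffers support (the real pack runs in a kernel; this host-style form
   with illustrative names shows the gather pattern only): */
static inline void pack_forward_positions(int n, const int* list,
                                          const X_FLOAT* x, X_FLOAT* buf)
{
  // gather the positions of the atoms named in list into a contiguous buffer
  for (int i = 0; i < n; i++) {
    const int j = list[i];
    buf[3 * i + 0] = x[3 * j + 0];
    buf[3 * i + 1] = x[3 * j + 1];
    buf[3 * i + 2] = x[3 * j + 2];
  }
}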

struct cuda_shared_neighlist { // member of CudaNeighList; has no instance in cuda_shared_data
  int maxlocal;
  int inum;                    // # of I atoms for which neighbors are stored
  int inum_border2;
  dev_array inum_border;       // # of atoms which interact with border atoms
  dev_array ilist;             // local indices of the I atoms
  dev_array ilist_border;
  dev_array numneigh;
  dev_array numneigh_inner;
  dev_array numneigh_border;
  dev_array firstneigh;
  dev_array neighbors;
  dev_array neighbors_border;
  dev_array neighbors_inner;
  int maxpage;
  dev_array page_pointers;
  dev_array* pages;
  int maxneighbors;
  int neigh_lists_per_page;
  double** cutneighsq;
  CUDA_FLOAT* cu_cutneighsq;
  int* binned_id;
  int* bin_dim;
  int bin_nmax;
  float bin_extraspace;
  double maxcut;
  dev_array ex_type;
  int nex_type;
  dev_array ex1_bit;
  dev_array ex2_bit;
  int nex_group;
  dev_array ex_mol_bit;
  int nex_mol;
};
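
/* A hedged sketch of the canonical LAMMPS traversal that ilist, numneigh, and
   firstneigh describe (the CUDA kernels use the paged device layout instead;
   this host-style form with raw pointers is illustrative only): */
static inline int count_neighbor_pairs(int inum, const int* ilist,
                                       const int* numneigh, int** firstneigh)
{
  int npairs = 0;
  for (int ii = 0; ii < inum; ii++) {
    const int i = ilist[ii];     // local index of the I atom
    // its J neighbors live at firstneigh[i][0 .. numneigh[i]-1]
    npairs += numneigh[i];
  }
  return npairs;
}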

struct cuda_compile_settings {  // used to compare the compile settings (e.g. precision) of the .cu files with those of the .cpp files
  int prec_glob;
  int prec_x;
  int prec_v;
  int prec_f;
  int prec_pppm;
  int prec_fft;
  int cufft;
  int arch;
};
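
/* A hedged sketch of how such a struct can be checked: each translation unit
   fills an instance from its own preprocessor flags, and a field-by-field
   comparison at startup catches builds whose .cu and .cpp sides disagree.
   The function name is illustrative: */
static inline bool compile_settings_match(const cuda_compile_settings& a,
                                          const cuda_compile_settings& b)
{
  return a.prec_glob == b.prec_glob && a.prec_x == b.prec_x &&
         a.prec_v == b.prec_v && a.prec_f == b.prec_f &&
         a.prec_pppm == b.prec_pppm && a.prec_fft == b.prec_fft &&
         a.cufft == b.cufft && a.arch == b.arch;
}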

struct cuda_timings_struct {
  // debug
  double test1;
  double test2;
  // transfers
  double transfer_upload_tmp_constr;
  double transfer_download_tmp_deconstr;

  // communication
  double comm_forward_total;
  double comm_forward_mpi_upper;
  double comm_forward_mpi_lower;
  double comm_forward_kernel_pack;
  double comm_forward_kernel_unpack;
  double comm_forward_kernel_self;
  double comm_forward_upload;
  double comm_forward_download;

  double comm_exchange_total;
  double comm_exchange_mpi;
  double comm_exchange_kernel_pack;
  double comm_exchange_kernel_unpack;
  double comm_exchange_kernel_fill;
  double comm_exchange_cpu_pack;
  double comm_exchange_upload;
  double comm_exchange_download;

  double comm_border_total;
  double comm_border_mpi;
  double comm_border_kernel_pack;
  double comm_border_kernel_unpack;
  double comm_border_kernel_self;
  double comm_border_kernel_buildlist;
  double comm_border_upload;
  double comm_border_download;

  // pair forces
  double pair_xtype_conversion;
  double pair_kernel;
  double pair_virial;
  double pair_force_collection;

  // neighbor
  double neigh_bin;
  double neigh_build;
  double neigh_special;

  // PPPM
  double pppm_particle_map;
  double pppm_make_rho;
  double pppm_brick2fft;
  double pppm_poisson;
  double pppm_fillbrick;
  double pppm_fieldforce;
  double pppm_compute;
};

struct cuda_shared_data {   // holds all relevant data from the different classes
  void* buffer;             // temporary GPU data used within individual routines; need not stay consistent outside of them
  int buffersize;           // maximum size of buffer
  int buffer_new;           // set to 1 when the buffer pointer has changed
  void* flag;
  void* debugdata;          // array for conveniently collecting debug data from the device; class Cuda holds the corresponding cu_debugdata and host array
  cuda_shared_atom atom;
  cuda_shared_pair pair;
  cuda_shared_domain domain;
  cuda_shared_pppm pppm;
  cuda_shared_comm comm;
  cuda_compile_settings compile_settings;
  cuda_timings_struct cuda_timings;
  int exchange_dim;
  int me;                   // MPI rank
  unsigned int datamask;
  int overlap_comm;
};


#endif // #ifndef _CUDA_SHARED_H_