/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

#ifndef _CUDA_SHARED_H_
#define _CUDA_SHARED_H_
#include "cuda_precision.h"

#define CUDA_MAX_DEBUG_SIZE 1000  // size of debugdata array (room for this many doubles, or twice as many ints)

struct dev_array {
  void* dev_data;           // pointer to memory address on the CUDA device
  unsigned dim[3];          // array dimensions
};

struct cuda_shared_atom {   // relevant data from the Atom class
  dev_array dx;             // accumulated distance for binning settings
  dev_array x;              // position
  dev_array v;              // velocity
  dev_array f;              // force
  dev_array tag;            // global atom ID
  dev_array type;           // atom type; ghost type = ntypes (ntypescuda = ntypes+1)
  dev_array mask;
  dev_array image;
  dev_array q;              // charges
  dev_array mass;           // per-type masses
  dev_array rmass;          // per-atom masses
  dev_array radius;         // per-atom radius
  dev_array density;
  dev_array omega;
  dev_array torque;
  dev_array molecule;

  dev_array special;
  int maxspecial;
  dev_array nspecial;
  int* special_flag;
  int molecular;

  dev_array eatom;          // per-atom energy
  dev_array vatom;          // per-atom virial
  int need_eatom;
  int need_vatom;

  dev_array x_type;         // position + type in an X_FLOAT4 struct
  dev_array v_radius;       // velocity + radius in a V_FLOAT4 struct; currently only used for the granular atom_style
  dev_array omega_rmass;    // omega + rmass in a V_FLOAT4 struct; currently only used for the granular atom_style

  double* mass_host;        // host pointer to the per-type masses
  //int natoms;             // total # of atoms in the system, could be 0
  int nghost;               // # of ghost atoms on this proc
  int nlocal;               // # of owned atoms on this proc
  int nall;                 // total # of owned + ghost atoms on this proc
  int nmax;                 // max # of owned + ghost atoms in arrays on this proc
  int ntypes;
  int q_flag;               // do we have charges?
  int rmass_flag;           // do we have per-atom masses?
  int firstgroup;
  int nfirst;

  int update_nlocal;
  int update_nmax;
  int update_neigh;

  dev_array xhold;          // positions at the last neighbor build
  X_FLOAT triggerneighsq;   // maximum squared displacement before reneighboring
  int reneigh_flag;         // is reneighboring necessary?
  int maxhold;              // size of xhold
  int dist_check;           // perform distance check for reneighboring
  dev_array binned_id;      // id of each binned atom (not the global tag!)
  dev_array binned_idnew;   // new id of each binned atom for sorting; effectively moves atom[binned_id[k]] to atom[binned_idnew[k]]
  float bin_extraspace;
  int bin_dim[3];
  int bin_nmax;
  dev_array map_array;
};
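
/* Usage sketch (illustrative only, not code from the original USER-CUDA
   package): how a host-side routine might refresh the per-proc atom counters
   of cuda_shared_atom before launching device kernels.  The function name is
   hypothetical; nall = nlocal + nghost follows the usual LAMMPS convention. */
inline void cuda_shared_atom_update_counts(cuda_shared_atom& a, int nlocal, int nghost)
{
  a.nlocal = nlocal;            // owned atoms on this proc
  a.nghost = nghost;            // ghost atoms on this proc
  a.nall   = nlocal + nghost;   // owned + ghost atoms on this proc
}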

struct cuda_shared_pair {    // relevant data from the Pair class
  char cudable_force;        // check for (cudable_force != 0)
  X_FLOAT cut_global;
  X_FLOAT cut_inner_global;
  X_FLOAT cut_coul_global;
  double** cut;              // type-type cutoff
  double** cutsq;            // type-type squared cutoff
  double** cut_inner;        // type-type inner cutoff
  double** cut_coul;         // type-type cutoff for coul
  double** coeff1;           // type-type pair parameters
  double** coeff2;
  double** coeff3;
  double** coeff4;
  double** coeff5;
  double** coeff6;
  double** coeff7;
  double** coeff8;
  double** coeff9;
  double** coeff10;
  double** offset;
  double* special_lj;
  double* special_coul;
  dev_array virial;          // ENERGY_FLOAT
  dev_array eng_vdwl;        // ENERGY_FLOAT
  dev_array eng_coul;        // ENERGY_FLOAT
  X_FLOAT cut_coulsq_global;
  F_FLOAT g_ewald, kappa;
  int freeze_group_bit;

  dev_array coeff1_gm;
  dev_array coeff2_gm;
  dev_array coeff3_gm;
  dev_array coeff4_gm;
  dev_array coeff5_gm;
  dev_array coeff6_gm;
  dev_array coeff7_gm;
  dev_array coeff8_gm;
  dev_array coeff9_gm;
  dev_array coeff10_gm;

  int lastgridsize;
  int n_energy_virial;
  int collect_forces_later;
  int use_block_per_atom;
  int override_block_per_atom;
  bool neighall;
};

struct cuda_shared_domain {  // relevant data from the Domain class
  X_FLOAT sublo[3];          // orthogonal box -> sub-box bounds on this proc
  X_FLOAT subhi[3];
  X_FLOAT boxlo[3];
  X_FLOAT boxhi[3];
  X_FLOAT prd[3];
  int periodicity[3];        // xyz periodicity as an array

  int triclinic;
  X_FLOAT xy;
  X_FLOAT xz;
  X_FLOAT yz;
  X_FLOAT boxlo_lamda[3];
  X_FLOAT boxhi_lamda[3];
  X_FLOAT prd_lamda[3];
  X_FLOAT h[6];
  X_FLOAT h_inv[6];
  V_FLOAT h_rate[6];
  int update;
};

struct cuda_shared_pppm {
  char cudable_force;
#ifdef FFT_CUFFT
  FFT_FLOAT* work1;
  FFT_FLOAT* work2;
  FFT_FLOAT* work3;
  PPPM_FLOAT* greensfn;
  PPPM_FLOAT* fkx;
  PPPM_FLOAT* fky;
  PPPM_FLOAT* fkz;
  PPPM_FLOAT* vg;
#endif
  int* part2grid;
  PPPM_FLOAT* density_brick;
  int* density_brick_int;
  PPPM_FLOAT density_intScale;
  PPPM_FLOAT* vdx_brick;
  PPPM_FLOAT* vdy_brick;
  PPPM_FLOAT* vdz_brick;
  PPPM_FLOAT* density_fft;
  ENERGY_FLOAT* energy;
  ENERGY_FLOAT* virial;
  int nxlo_in;
  int nxhi_in;
  int nxlo_out;
  int nxhi_out;
  int nylo_in;
  int nyhi_in;
  int nylo_out;
  int nyhi_out;
  int nzlo_in;
  int nzhi_in;
  int nzlo_out;
  int nzhi_out;
  int nx_pppm;
  int ny_pppm;
  int nz_pppm;
  PPPM_FLOAT qqrd2e;
  int order;
  // float3 sublo;
  PPPM_FLOAT* rho_coeff;
  int nmax;
  int nlocal;
  PPPM_FLOAT* debugdata;
  PPPM_FLOAT delxinv;
  PPPM_FLOAT delyinv;
  PPPM_FLOAT delzinv;
  int nlower;
  int nupper;
  PPPM_FLOAT shiftone;
  PPPM_FLOAT3* fH;
};

struct cuda_shared_comm {
  int maxswap;
  int maxlistlength;
  dev_array pbc;
  dev_array slablo;
  dev_array slabhi;
  dev_array multilo;
  dev_array multihi;
  dev_array sendlist;
  int grow_flag;
  int comm_phase;

  int nsend;
  int* nsend_swap;
  int* send_size;
  int* recv_size;
  double** buf_send;
  void** buf_send_dev;
  double** buf_recv;
  void** buf_recv_dev;
  void* buffer;
  int buffer_size;
  double overlap_split_ratio;
};
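
/* Sketch (illustrative only, not code from the original package): converting a
   point from lamda (fractional) coordinates to box coordinates using the
   cuda_shared_domain fields.  It assumes the LAMMPS Domain convention
   h = {xprd, yprd, zprd, yz, xz, xy}; the helper name is hypothetical. */
inline void cuda_shared_domain_lamda2x(const cuda_shared_domain& d,
                                       const X_FLOAT lamda[3], X_FLOAT x[3])
{
  x[0] = d.h[0] * lamda[0] + d.h[5] * lamda[1] + d.h[4] * lamda[2] + d.boxlo[0];
  x[1] = d.h[1] * lamda[1] + d.h[3] * lamda[2] + d.boxlo[1];
  x[2] = d.h[2] * lamda[2] + d.boxlo[2];
}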

struct cuda_shared_neighlist {  // member of CudaNeighList; has no instance in cuda_shared_data
  int maxlocal;
  int inum;                     // # of I atoms neighbors are stored for
  int inum_border2;
  dev_array inum_border;        // # of atoms which interact with border atoms
  dev_array ilist;              // local indices of I atoms
  dev_array ilist_border;
  dev_array numneigh;
  dev_array numneigh_inner;
  dev_array numneigh_border;
  dev_array firstneigh;
  dev_array neighbors;
  dev_array neighbors_border;
  dev_array neighbors_inner;
  int maxpage;
  dev_array page_pointers;
  dev_array* pages;
  int maxneighbors;
  int neigh_lists_per_page;
  double** cutneighsq;
  CUDA_FLOAT* cu_cutneighsq;
  int* binned_id;
  int* bin_dim;
  int bin_nmax;
  float bin_extraspace;
  double maxcut;
  dev_array ex_type;
  int nex_type;
  dev_array ex1_bit;
  dev_array ex2_bit;
  int nex_group;
  dev_array ex_mol_bit;
  int nex_mol;
};

struct cuda_compile_settings {  // used to compare the compile settings (i.e. precision) of the .cu files and the .cpp files
  int prec_glob;
  int prec_x;
  int prec_v;
  int prec_f;
  int prec_pppm;
  int prec_fft;
  int cufft;
  int arch;
};

struct cuda_timings_struct {
  // debug
  double test1;
  double test2;

  // transfers
  double transfer_upload_tmp_constr;
  double transfer_download_tmp_deconstr;

  // communication
  double comm_forward_total;
  double comm_forward_mpi_upper;
  double comm_forward_mpi_lower;
  double comm_forward_kernel_pack;
  double comm_forward_kernel_unpack;
  double comm_forward_kernel_self;
  double comm_forward_upload;
  double comm_forward_download;

  double comm_exchange_total;
  double comm_exchange_mpi;
  double comm_exchange_kernel_pack;
  double comm_exchange_kernel_unpack;
  double comm_exchange_kernel_fill;
  double comm_exchange_cpu_pack;
  double comm_exchange_upload;
  double comm_exchange_download;

  double comm_border_total;
  double comm_border_mpi;
  double comm_border_kernel_pack;
  double comm_border_kernel_unpack;
  double comm_border_kernel_self;
  double comm_border_kernel_buildlist;
  double comm_border_upload;
  double comm_border_download;

  // pair forces
  double pair_xtype_conversion;
  double pair_kernel;
  double pair_virial;
  double pair_force_collection;

  // neighbor
  double neigh_bin;
  double neigh_build;
  double neigh_special;

  // PPPM
  double pppm_particle_map;
  double pppm_make_rho;
  double pppm_brick2fft;
  double pppm_poisson;
  double pppm_fillbrick;
  double pppm_fieldforce;
  double pppm_compute;
};

struct cuda_shared_data {   // holds space for all relevant data from the different classes
  void* buffer;             // holds temporary GPU data (used within subroutines; need not stay consistent outside of that routine)
  int buffersize;           // max size of buffer
  int buffer_new;           // should be 1 if the pointer to buffer has changed
  void* flag;
  void* debugdata;          // array for easily collecting debug data from the device; class Cuda holds the corresponding cu_debugdata and host array
  cuda_shared_atom atom;
  cuda_shared_pair pair;
  cuda_shared_domain domain;
  cuda_shared_pppm pppm;
  cuda_shared_comm comm;
  cuda_compile_settings compile_settings;
  cuda_timings_struct cuda_timings;
  int exchange_dim;
  int me;                   // MPI rank
  unsigned int datamask;
  int overlap_comm;
};
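
/* Sketch (illustrative only, not code from the original package): the
   cuda_compile_settings struct exists so the settings baked into the .cu
   objects can be checked against those of the .cpp objects at run time.
   A comparison might look like this; the helper name is hypothetical. */
inline bool cuda_compile_settings_match(const cuda_compile_settings& cu,
                                        const cuda_compile_settings& cpp)
{
  return cu.prec_glob == cpp.prec_glob &&
         cu.prec_x    == cpp.prec_x    &&
         cu.prec_v    == cpp.prec_v    &&
         cu.prec_f    == cpp.prec_f    &&
         cu.prec_pppm == cpp.prec_pppm &&
         cu.prec_fft  == cpp.prec_fft  &&
         cu.cufft     == cpp.cufft     &&
         cu.arch      == cpp.arch;      // every field must agree
}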

#endif // _CUDA_SHARED_H_