1 /*************************************************************************** 2 base_three.h 3 ------------------- 4 W. Michael Brown (ORNL) 5 6 Base class for 3-body potentials 7 8 __________________________________________________________________________ 9 This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) 10 __________________________________________________________________________ 11 12 begin : Tue April 2, 2013 13 email : brownw@ornl.gov 14 ***************************************************************************/ 15 16 #ifndef LAL_BASE_ATOMIC_H 17 #define LAL_BASE_ATOMIC_H 18 19 #include "lal_device.h" 20 #include "lal_balance.h" 21 #include "mpi.h" 22 23 #if defined(USE_OPENCL) 24 #include "geryon/ocl_texture.h" 25 #elif defined(USE_CUDART) 26 #include "geryon/nvc_texture.h" 27 #else 28 #include "geryon/nvd_texture.h" 29 #endif 30 31 namespace LAMMPS_AL { 32 33 template <class numtyp, class acctyp> 34 class BaseThree { 35 public: 36 BaseThree(); 37 virtual ~BaseThree(); 38 39 /// Clear any previous data and set up for a new LAMMPS run 40 /** \param max_nbors initial number of rows in the neighbor matrix 41 * \param cell_size cutoff + skin 42 * \param gpu_split fraction of particles handled by device 43 * \param k_two name for the kernel for 2-body force calculation 44 * \param k_three name for the kernel for 3-body force calculation 45 * 46 * Returns: 47 * - 0 if successfull 48 * - -1 if fix gpu not found 49 * - -3 if there is an out of memory error 50 * - -4 if the GPU library was not compiled for GPU 51 * - -5 Double precision is not supported on card 52 * - -10 if invalid thread_per_atom setting **/ 53 int init_three(const int nlocal, const int nall, const int max_nbors, 54 const int maxspecial, const double cell_size, 55 const double gpu_split, FILE *screen, 56 const void *pair_program, const char *k_two, 57 const char *k_three_center, const char *k_three_end); 58 59 /// Estimate the overhead for GPU context changes and CPU driver 60 void estimate_gpu_overhead(); 61 62 /// Check if there is enough storage for atom arrays and realloc if not 63 /** \param success set to false if insufficient memory **/ resize_atom(const int inum,const int nall,bool & success)64 inline void resize_atom(const int inum, const int nall, bool &success) { 65 if (atom->resize(nall, success)) 66 pos_tex.bind_float(atom->x,4); 67 ans->resize(inum,success); 68 #ifdef THREE_CONCURRENT 69 ans2->resize(inum,success); 70 #endif 71 } 72 73 /// Check if there is enough storage for neighbors and realloc if not 74 /** \param nlocal number of particles whose nbors must be stored on device 75 * \param host_inum number of particles whose nbors need to copied to host 76 * \param current maximum number of neighbors 77 * \note olist_size=total number of local particles **/ resize_local(const int inum,const int max_nbors,bool & success)78 inline void resize_local(const int inum, const int max_nbors, bool &success) { 79 nbor->resize(inum,max_nbors,success); 80 } 81 82 /// Check if there is enough storage for neighbors and realloc if not 83 /** \param nlocal number of particles whose nbors must be stored on device 84 * \param host_inum number of particles whose nbors need to copied to host 85 * \param current maximum number of neighbors 86 * \note host_inum is 0 if the host is performing neighboring 87 * \note nlocal+host_inum=total number local particles 88 * \note olist_size=0 **/ resize_local(const int inum,const int host_inum,const int max_nbors,bool & success)89 inline void resize_local(const int inum, const int host_inum, 90 const int max_nbors, bool &success) { 91 nbor->resize(inum,host_inum,max_nbors,success); 92 } 93 94 /// Clear all host and device data 95 /** \note This is called at the beginning of the init() routine **/ 96 void clear_atomic(); 97 98 /// Returns memory usage on device per atom 99 int bytes_per_atom_atomic(const int max_nbors) const; 100 101 /// Total host memory used by library for pair style 102 double host_memory_usage_atomic() const; 103 104 /// Accumulate timers acc_timers()105 inline void acc_timers() { 106 if (device->time_device()) { 107 nbor->acc_timers(); 108 time_pair.add_to_total(); 109 atom->acc_timers(); 110 ans->acc_timers(); 111 #ifdef THREE_CONCURRENT 112 ans2->acc_timers(); 113 #endif 114 } 115 } 116 117 /// Zero timers zero_timers()118 inline void zero_timers() { 119 time_pair.zero(); 120 atom->zero_timers(); 121 ans->zero_timers(); 122 #ifdef THREE_CONCURRENT 123 ans2->zero_timers(); 124 #endif 125 } 126 127 /// Copy neighbor list from host 128 int * reset_nbors(const int nall, const int inum, const int nlist, int *ilist, 129 int *numj, int **firstneigh, bool &success); 130 131 /// Build neighbor list on device 132 void build_nbor_list(const int inum, const int host_inum, 133 const int nall, double **host_x, int *host_type, 134 double *sublo, double *subhi, int *tag, int **nspecial, 135 int **special, bool &success); 136 137 /// Pair loop with host neighboring 138 void compute(const int f_ago, const int inum_full, const int nall, 139 const int nlist, double **host_x, int *host_type, 140 int *ilist, int *numj, int **firstneigh, const bool eflag, 141 const bool vflag, const bool eatom, const bool vatom, 142 int &host_start, const double cpu_time, bool &success); 143 144 /// Pair loop with device neighboring 145 int * compute(const int ago, const int inum_full, const int nall, 146 double **host_x, int *host_type, double *sublo, 147 double *subhi, int *tag, int **nspecial, 148 int **special, const bool eflag, const bool vflag, 149 const bool eatom, const bool vatom, int &host_start, 150 const double cpu_time, bool &success); 151 152 /// Pair loop with device neighboring 153 int ** compute(const int ago, const int inum_full, 154 const int nall, double **host_x, int *host_type, double *sublo, 155 double *subhi, int *tag, int **nspecial, 156 int **special, const bool eflag, const bool vflag, 157 const bool eatom, const bool vatom, int &host_start, 158 int **ilist, int **numj, const double cpu_time, bool &success); 159 160 // -------------------------- DEVICE DATA ------------------------- 161 162 /// Device Properties and Atom and Neighbor storage 163 Device<numtyp,acctyp> *device; 164 165 /// Geryon device 166 UCL_Device *ucl_device; 167 168 /// Device Timers 169 UCL_Timer time_pair; 170 171 /// Host device load balancer 172 Balance<numtyp,acctyp> hd_balancer; 173 174 /// LAMMPS pointer for screen output 175 FILE *screen; 176 177 // --------------------------- ATOM DATA -------------------------- 178 179 /// Atom Data 180 Atom<numtyp,acctyp> *atom; 181 182 // ------------------------ FORCE/ENERGY DATA ----------------------- 183 184 Answer<numtyp,acctyp> *ans; 185 #ifdef THREE_CONCURRENT 186 Answer<numtyp,acctyp> *ans2; 187 #endif 188 189 // --------------------------- NBOR DATA ---------------------------- 190 191 /// Neighbor data 192 Neighbor *nbor; 193 194 // ------------------------- DEVICE KERNELS ------------------------- 195 UCL_Program *pair_program; 196 UCL_Kernel k_pair, k_three_center, k_three_end, k_three_end_vatom; block_pair()197 inline int block_pair() { return _block_pair; } block_size()198 inline int block_size() { return _block_size; } 199 200 // --------------------------- TEXTURES ----------------------------- 201 UCL_Texture pos_tex; 202 203 protected: 204 bool _compiled; 205 int _block_pair, _block_size, _threads_per_atom, _end_command_queue; 206 double _max_bytes, _max_an_bytes; 207 double _gpu_overhead, _driver_overhead; 208 UCL_D_Vec<int> *_nbor_data; 209 210 void compile_kernels(UCL_Device &dev, const void *pair_string, 211 const char *k_two, const char *k_three_center, 212 const char *k_three_end); 213 214 virtual void loop(const bool _eflag, const bool _vflag, 215 const int evatom) = 0; 216 }; 217 218 } 219 220 #endif 221 222