1 /***************************************************************************
2                                 base_three.h
3                              -------------------
4                             W. Michael Brown (ORNL)
5 
6   Base class for 3-body potentials
7 
8  __________________________________________________________________________
9     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
10  __________________________________________________________________________
11 
12     begin                : Tue April 2, 2013
13     email                : brownw@ornl.gov
14  ***************************************************************************/
15 
16 #ifndef LAL_BASE_ATOMIC_H
17 #define LAL_BASE_ATOMIC_H
18 
19 #include "lal_device.h"
20 #include "lal_balance.h"
21 #include "mpi.h"
22 
23 #if defined(USE_OPENCL)
24 #include "geryon/ocl_texture.h"
25 #elif defined(USE_CUDART)
26 #include "geryon/nvc_texture.h"
27 #else
28 #include "geryon/nvd_texture.h"
29 #endif
30 
31 namespace LAMMPS_AL {
32 
33 template <class numtyp, class acctyp>
34 class BaseThree {
35  public:
36   BaseThree();
37   virtual ~BaseThree();
38 
39   /// Clear any previous data and set up for a new LAMMPS run
40   /** \param max_nbors initial number of rows in the neighbor matrix
41     * \param cell_size cutoff + skin
42     * \param gpu_split fraction of particles handled by device
43     * \param k_two name for the kernel for 2-body force calculation
44     * \param k_three name for the kernel for 3-body force calculation
45     *
46     * Returns:
47     * -  0 if successfull
48     * - -1 if fix gpu not found
49     * - -3 if there is an out of memory error
50     * - -4 if the GPU library was not compiled for GPU
51     * - -5 Double precision is not supported on card
52     * - -10 if invalid thread_per_atom setting **/
53   int init_three(const int nlocal, const int nall, const int max_nbors,
54                  const int maxspecial, const double cell_size,
55                  const double gpu_split, FILE *screen,
56                  const void *pair_program, const char *k_two,
57                  const char *k_three_center, const char *k_three_end);
58 
59   /// Estimate the overhead for GPU context changes and CPU driver
60   void estimate_gpu_overhead();
61 
62   /// Check if there is enough storage for atom arrays and realloc if not
63   /** \param success set to false if insufficient memory **/
resize_atom(const int inum,const int nall,bool & success)64   inline void resize_atom(const int inum, const int nall, bool &success) {
65     if (atom->resize(nall, success))
66       pos_tex.bind_float(atom->x,4);
67     ans->resize(inum,success);
68     #ifdef THREE_CONCURRENT
69     ans2->resize(inum,success);
70     #endif
71   }
72 
73   /// Check if there is enough storage for neighbors and realloc if not
74   /** \param nlocal number of particles whose nbors must be stored on device
75     * \param host_inum number of particles whose nbors need to copied to host
76     * \param current maximum number of neighbors
77     * \note olist_size=total number of local particles **/
resize_local(const int inum,const int max_nbors,bool & success)78   inline void resize_local(const int inum, const int max_nbors, bool &success) {
79     nbor->resize(inum,max_nbors,success);
80   }
81 
82   /// Check if there is enough storage for neighbors and realloc if not
83   /** \param nlocal number of particles whose nbors must be stored on device
84     * \param host_inum number of particles whose nbors need to copied to host
85     * \param current maximum number of neighbors
86     * \note host_inum is 0 if the host is performing neighboring
87     * \note nlocal+host_inum=total number local particles
88     * \note olist_size=0 **/
resize_local(const int inum,const int host_inum,const int max_nbors,bool & success)89   inline void resize_local(const int inum, const int host_inum,
90                            const int max_nbors, bool &success) {
91     nbor->resize(inum,host_inum,max_nbors,success);
92   }
93 
94   /// Clear all host and device data
95   /** \note This is called at the beginning of the init() routine **/
96   void clear_atomic();
97 
98   /// Returns memory usage on device per atom
99   int bytes_per_atom_atomic(const int max_nbors) const;
100 
101   /// Total host memory used by library for pair style
102   double host_memory_usage_atomic() const;
103 
104   /// Accumulate timers
acc_timers()105   inline void acc_timers() {
106     if (device->time_device()) {
107       nbor->acc_timers();
108       time_pair.add_to_total();
109       atom->acc_timers();
110       ans->acc_timers();
111       #ifdef THREE_CONCURRENT
112       ans2->acc_timers();
113       #endif
114     }
115   }
116 
117   /// Zero timers
zero_timers()118   inline void zero_timers() {
119     time_pair.zero();
120     atom->zero_timers();
121     ans->zero_timers();
122     #ifdef THREE_CONCURRENT
123     ans2->zero_timers();
124     #endif
125   }
126 
127   /// Copy neighbor list from host
128   int * reset_nbors(const int nall, const int inum, const int nlist, int *ilist,
129                     int *numj, int **firstneigh, bool &success);
130 
131   /// Build neighbor list on device
132   void build_nbor_list(const int inum, const int host_inum,
133                        const int nall, double **host_x, int *host_type,
134                        double *sublo, double *subhi, int *tag, int **nspecial,
135                        int **special, bool &success);
136 
137   /// Pair loop with host neighboring
138   void compute(const int f_ago, const int inum_full, const int nall,
139                const int nlist, double **host_x, int *host_type,
140                int *ilist, int *numj, int **firstneigh, const bool eflag,
141                const bool vflag, const bool eatom, const bool vatom,
142                int &host_start, const double cpu_time, bool &success);
143 
144   /// Pair loop with device neighboring
145   int * compute(const int ago, const int inum_full, const int nall,
146                 double **host_x, int *host_type, double *sublo,
147                 double *subhi, int *tag, int **nspecial,
148                 int **special, const bool eflag, const bool vflag,
149                 const bool eatom, const bool vatom, int &host_start,
150                 const double cpu_time, bool &success);
151 
152   /// Pair loop with device neighboring
153   int ** compute(const int ago, const int inum_full,
154                  const int nall, double **host_x, int *host_type, double *sublo,
155                  double *subhi, int *tag, int **nspecial,
156                  int **special, const bool eflag, const bool vflag,
157                  const bool eatom, const bool vatom, int &host_start,
158                  int **ilist, int **numj, const double cpu_time, bool &success);
159 
160   // -------------------------- DEVICE DATA -------------------------
161 
162   /// Device Properties and Atom and Neighbor storage
163   Device<numtyp,acctyp> *device;
164 
165   /// Geryon device
166   UCL_Device *ucl_device;
167 
168   /// Device Timers
169   UCL_Timer time_pair;
170 
171   /// Host device load balancer
172   Balance<numtyp,acctyp> hd_balancer;
173 
174   /// LAMMPS pointer for screen output
175   FILE *screen;
176 
177   // --------------------------- ATOM DATA --------------------------
178 
179   /// Atom Data
180   Atom<numtyp,acctyp> *atom;
181 
182   // ------------------------ FORCE/ENERGY DATA -----------------------
183 
184   Answer<numtyp,acctyp> *ans;
185   #ifdef THREE_CONCURRENT
186   Answer<numtyp,acctyp> *ans2;
187   #endif
188 
189   // --------------------------- NBOR DATA ----------------------------
190 
191   /// Neighbor data
192   Neighbor *nbor;
193 
194   // ------------------------- DEVICE KERNELS -------------------------
195   UCL_Program *pair_program;
196   UCL_Kernel k_pair, k_three_center, k_three_end, k_three_end_vatom;
block_pair()197   inline int block_pair() { return _block_pair; }
block_size()198   inline int block_size() { return _block_size; }
199 
200   // --------------------------- TEXTURES -----------------------------
201   UCL_Texture pos_tex;
202 
203  protected:
204   bool _compiled;
205   int _block_pair, _block_size, _threads_per_atom, _end_command_queue;
206   double _max_bytes, _max_an_bytes;
207   double _gpu_overhead, _driver_overhead;
208   UCL_D_Vec<int> *_nbor_data;
209 
210   void compile_kernels(UCL_Device &dev, const void *pair_string,
211                        const char *k_two, const char *k_three_center,
212                        const char *k_three_end);
213 
214   virtual void loop(const bool _eflag, const bool _vflag,
215                     const int evatom) = 0;
216 };
217 
218 }
219 
220 #endif
221 
222