/***************************************************************************
                                   atom.h
                             -------------------
                            W. Michael Brown (ORNL)

  Class for particle data management

 __________________________________________________________________________
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 __________________________________________________________________________

    begin                :
    email                : brownw@ornl.gov
 ***************************************************************************/

#ifndef PAIR_GPU_ATOM_H
#define PAIR_GPU_ATOM_H

#include <math.h>
#include "mpi.h"

#if defined(USE_OPENCL)
#include "geryon/ocl_timer.h"
#include "geryon/ocl_mat.h"
#include "geryon/ocl_kernel.h"
using namespace ucl_opencl;
#elif defined(USE_CUDART)
#include "geryon/nvc_timer.h"
#include "geryon/nvc_mat.h"
#include "geryon/nvc_kernel.h"
using namespace ucl_cudart;
#else
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
#include "geryon/nvd_kernel.h"
using namespace ucl_cudadr;
#endif

#ifdef USE_CUDPP
#include "cudpp.h"
#endif

#include "lal_precision.h"

namespace LAMMPS_AL {

template <class numtyp, class acctyp>
class Atom {
 public:
  Atom();
  ~Atom() { clear(); }

  /// Maximum number of atoms that can be stored with current allocation
  inline int max_atoms() const { return _max_atoms; }
  /// Current number of local+ghost atoms stored
  inline int nall() const { return _nall; }

  /// Set number of local+ghost atoms for future copy operations
  inline void nall(const int n) { _nall=n; }

  /// Memory usage per atom in this class
  int bytes_per_atom() const;

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param rot True if atom storage needs quaternions
    * \param gpu_nbor 0 if neighboring will be performed on host
    *        gpu_nbor 1 if neighboring will be performed on device
    *        gpu_nbor 2 if binning on host and neighboring on device **/
  bool init(const int nall, const bool charge, const bool rot,
            UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false);
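
  // Usage sketch (hypothetical, not from this file): a pair style typically
  // calls init() once during setup, e.g.
  //   Atom<numtyp,acctyp> *atom=new Atom<numtyp,acctyp>();
  //   bool ok=atom->init(nlocal+nghost,true,false,device,1);
  //   if (!ok) { /* not enough device memory; caller must handle */ }
  // Here nlocal, nghost, and device are assumed to come from the caller;
  // charge storage is requested, quaternions are not, and neighboring is
  // performed on the device (gpu_nbor=1).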

  /// Check if we have enough device storage and realloc if not
  /** Returns true if resized with any call during this timestep **/
  inline bool resize(const int nall, bool &success) {
    _nall=nall;
    if (nall>_max_atoms) {
      clear_resize();
      success = success && alloc(nall);
      _resized=true;
    }
    return _resized;
  }
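
  // Usage sketch (hypothetical): grow device storage before each copy if the
  // local+ghost count may have changed, e.g.
  //   bool success=true;
  //   atom->resize(nlocal+nghost,success);
  //   if (!success) { /* allocation failed; caller must recover */ }
  // resize() also records the new count, so later cast/copy calls pick it up
  // through nall().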

  /// If already initialized by another LAMMPS style, add fields as necessary
  /** \param rot True if atom storage needs quaternions
    * \param gpu_nbor 0 if neighboring will be performed on host
    *        gpu_nbor 1 if neighboring will be performed on device
    *        gpu_nbor 2 if binning on host and neighboring on device **/
  bool add_fields(const bool charge, const bool rot, const int gpu_nbor,
                  const bool bonds);

  /// Returns true if GPU is using charges
  bool charge() { return _charge; }

  /// Returns true if GPU is using quaternions
  bool quaternion() { return _rot; }

  /// Only free matrices of length inum or nall for resizing
  void clear_resize();

  /// Free all memory on host and device
  void clear();

  /// Return the total amount of host memory used by class in bytes
  double host_memory_usage() const;

  /// Sort arrays for neighbor list calculation on device
  void sort_neighbor(const int num_atoms);

  /// Add copy times to timers
  inline void acc_timers() {
    time_pos.add_to_total();
    if (_charge)
      time_q.add_to_total();
    if (_rot)
      time_quat.add_to_total();
  }

  /// Zero the copy timers
  inline void zero_timers() {
    time_pos.zero();
    if (_charge)
      time_q.zero();
    if (_rot)
      time_quat.zero();
  }

  /// Return the total time for host/device data transfer
  /** Zeros the total so that the atom times are only included once **/
  inline double transfer_time() {
    double total=time_pos.total_seconds();
    time_pos.zero_total();
    if (_charge) {
      total+=time_q.total_seconds();
      time_q.zero_total();
    }
    if (_rot) {
      total+=time_quat.total_seconds();
      time_quat.zero_total();
    }

    return total+_time_transfer/1000.0;
  }
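
  // Accounting sketch (hypothetical): a host driver can fold these totals
  // into its own per-run timers, e.g.
  //   double data_time=atom->transfer_time()+atom->cast_time();
  // Both calls zero their accumulators, so an Atom object shared by several
  // styles is only counted once per output.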

  /// Return the total time for data cast/pack
  /** Zeros the time so that atom times are only included once **/
  inline double cast_time()
    { double t=_time_cast; _time_cast=0.0; return t; }

  /// Pack LAMMPS atom type constants into matrix and copy to device
  template <class dev_typ, class t1>
  inline void type_pack1(const int n, const int m_size,
                         UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
                         t1 **one) {
    int ii=0;
    for (int i=0; i<n; i++) {
      for (int j=0; j<n; j++) {
        buffer[ii]=static_cast<numtyp>(one[i][j]);
        ii++;
      }
      ii+=m_size-n;
    }
    UCL_H_Vec<dev_typ> view;
    view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
    ucl_copy(dev_v,view,false);
  }
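
  // Usage sketch (hypothetical): pack a per-type-pair coefficient table such
  // as cutsq (a square host array, assumed to come from the pair style) into
  // a device vector padded to width lj_types:
  //   UCL_H_Vec<numtyp> host_buf;
  //   host_buf.alloc(lj_types*lj_types,*dev,UCL_WRITE_ONLY);
  //   UCL_D_Vec<numtyp> dev_cutsq;
  //   dev_cutsq.alloc(lj_types*lj_types,*dev,UCL_READ_ONLY);
  //   atom->type_pack1(n,lj_types,dev_cutsq,host_buf,cutsq);
  // where n rows and columns of cutsq are packed (n<=lj_types) and lj_types,
  // cutsq, and n are names assumed from the caller.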

  /// Pack LAMMPS atom type constants into 2 vectors and copy to device
  template <class dev_typ, class t1, class t2>
  inline void type_pack2(const int n, const int m_size,
                         UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
                         t1 **one, t2 **two) {
    int ii=0;
    for (int i=0; i<n; i++) {
      for (int j=0; j<n; j++) {
        buffer[ii*2]=static_cast<numtyp>(one[i][j]);
        buffer[ii*2+1]=static_cast<numtyp>(two[i][j]);
        ii++;
      }
      ii+=m_size-n;
    }
    UCL_H_Vec<dev_typ> view;
    view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
    ucl_copy(dev_v,view,false);
  }

  /// Pack LAMMPS atom type constants (3) into 4 vectors and copy to device
  template <class dev_typ, class t1, class t2, class t3>
  inline void type_pack4(const int n, const int m_size,
                         UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
                         t1 **one, t2 **two, t3 **three) {
    int ii=0;
    for (int i=0; i<n; i++) {
      for (int j=0; j<n; j++) {
        buffer[ii*4]=static_cast<numtyp>(one[i][j]);
        buffer[ii*4+1]=static_cast<numtyp>(two[i][j]);
        buffer[ii*4+2]=static_cast<numtyp>(three[i][j]);
        ii++;
      }
      ii+=m_size-n;
    }
    UCL_H_Vec<dev_typ> view;
    view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
    ucl_copy(dev_v,view,false);
  }

  /// Pack LAMMPS atom type constants (4) into 4 vectors and copy to device
  template <class dev_typ, class t1, class t2, class t3, class t4>
  inline void type_pack4(const int n, const int m_size,
                         UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
                         t1 **one, t2 **two, t3 **three, t4 **four) {
    int ii=0;
    for (int i=0; i<n; i++) {
      for (int j=0; j<n; j++) {
        buffer[ii*4]=static_cast<numtyp>(one[i][j]);
        buffer[ii*4+1]=static_cast<numtyp>(two[i][j]);
        buffer[ii*4+2]=static_cast<numtyp>(three[i][j]);
        buffer[ii*4+3]=static_cast<numtyp>(four[i][j]);
        ii++;
      }
      ii+=m_size-n;
    }
    UCL_H_Vec<dev_typ> view;
    view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
    ucl_copy(dev_v,view,false);
  }

  /// Pack LAMMPS atom "self" type constants into 2 vectors and copy to device
  template <class dev_typ, class t1, class t2>
  inline void self_pack2(const int n, UCL_D_Vec<dev_typ> &dev_v,
                         UCL_H_Vec<numtyp> &buffer, t1 **one, t2 **two) {
    for (int i=0; i<n; i++) {
      buffer[i*2]=static_cast<numtyp>(one[i][i]);
      buffer[i*2+1]=static_cast<numtyp>(two[i][i]);
    }
    UCL_H_Vec<dev_typ> view;
    view.view((dev_typ*)buffer.begin(),n,*dev);
    ucl_copy(dev_v,view,false);
  }

  // -------------------------COPY TO GPU ----------------------------------

  /// Signal that we need to transfer atom data for next timestep
  inline void data_unavail()
    { _x_avail=false; _q_avail=false; _quat_avail=false; _resized=false; }

  /// Cast positions and types to write buffer
  inline void cast_x_data(double **host_ptr, const int *host_type) {
    if (_x_avail==false) {
      double t=MPI_Wtime();
      #ifdef GPU_CAST
      memcpy(x_cast.host.begin(),host_ptr[0],_nall*3*sizeof(double));
      memcpy(type_cast.host.begin(),host_type,_nall*sizeof(int));
      #else
      int wl=0;
      for (int i=0; i<_nall; i++) {
        x[wl]=host_ptr[i][0];
        x[wl+1]=host_ptr[i][1];
        x[wl+2]=host_ptr[i][2];
        x[wl+3]=host_type[i];
        wl+=4;
      }
      #endif
      _time_cast+=MPI_Wtime()-t;
    }
  }

  /// Copy positions and types to device asynchronously
  /** Copies nall() elements **/
  inline void add_x_data(double **host_ptr, int *host_type) {
    time_pos.start();
    if (_x_avail==false) {
      #ifdef GPU_CAST
      x_cast.update_device(_nall*3,true);
      type_cast.update_device(_nall,true);
      int block_size=64;
      int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
      k_cast_x.set_size(GX,block_size);
      k_cast_x.run(&x, &x_cast, &type_cast, &_nall);
      #else
      x.update_device(_nall*4,true);
      #endif
      _x_avail=true;
    }
    time_pos.stop();
  }

  /// Calls cast_x_data and add_x_data and times the routines
  inline void cast_copy_x(double **host_ptr, int *host_type) {
    cast_x_data(host_ptr,host_type);
    add_x_data(host_ptr,host_type);
  }
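
  // Per-timestep flow (a sketch; host arrays x, type, and q are assumed to
  // come from the caller):
  //   atom->data_unavail();            // new coordinates this step
  //   atom->cast_copy_x(x,type);       // pack positions/types, start async copy
  //   if (atom->charge()) {
  //     atom->cast_q_data(q);          // pack charges into the write buffer
  //     atom->add_q_data();            // start the asynchronous charge copy
  //   }
  //   ...launch kernels, then atom->acc_timers() to fold copy times in...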

  // Cast charges to write buffer
  template<class cpytyp>
  inline void cast_q_data(cpytyp *host_ptr) {
    if (_q_avail==false) {
      double t=MPI_Wtime();
      // If double precision, still memcpy for async transfers
      if (_host_view) {
        q.host.view((numtyp*)host_ptr,_nall,*dev);
        q.device.view(q.host);
      } else if (sizeof(numtyp)==sizeof(double))
        memcpy(q.host.begin(),host_ptr,_nall*sizeof(numtyp));
      else
        for (int i=0; i<_nall; i++) q[i]=host_ptr[i];
      _time_cast+=MPI_Wtime()-t;
    }
  }

  // Copy charges to device asynchronously
  inline void add_q_data() {
    if (_q_avail==false) {
      q.update_device(_nall,true);
      _q_avail=true;
    }
  }

  // Cast quaternions to write buffer
  template<class cpytyp>
  inline void cast_quat_data(cpytyp *host_ptr) {
    if (_quat_avail==false) {
      double t=MPI_Wtime();
      if (_host_view) {
        quat.host.view((numtyp*)host_ptr,_nall*4,*dev);
        quat.device.view(quat.host);
      } else if (sizeof(numtyp)==sizeof(double))
        memcpy(quat.host.begin(),host_ptr,_nall*4*sizeof(numtyp));
      else
        for (int i=0; i<_nall*4; i++) quat[i]=host_ptr[i];
      _time_cast+=MPI_Wtime()-t;
    }
  }

  // Copy quaternions to device asynchronously
  /** Copies nall()*4 elements **/
  inline void add_quat_data() {
    if (_quat_avail==false) {
      quat.update_device(_nall*4,true);
      _quat_avail=true;
    }
  }

  /// Add in casting time from additional data (seconds)
  inline void add_cast_time(double t) { _time_cast+=t; }

  /// Add in transfer time from additional data (ms)
  inline void add_transfer_time(double t) { _time_transfer+=t; }

  /// Return the maximum number of bytes used on the device (zeroes the count)
  inline double max_gpu_bytes()
    { double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; }

  /// Returns true if the device is addressing memory on the host
  inline bool host_view() { return _host_view; }

  // ------------------------------ DATA ----------------------------------

  /// Atom coordinates and types ([0] is x, [1] is y, [2] is z, [3] is type)
  UCL_Vector<numtyp,numtyp> x;
  /// Charges
  UCL_Vector<numtyp,numtyp> q;
  /// Quaternions
  UCL_Vector<numtyp,numtyp> quat;

  #ifdef GPU_CAST
  UCL_Vector<double,double> x_cast;
  UCL_Vector<int,int> type_cast;
  #endif

  /// Cell list identifiers for device nbor builds
  UCL_D_Vec<unsigned> dev_cell_id;
  /// Particle identifiers for device nbor builds
  UCL_D_Vec<int> dev_particle_id;
  /// Atom tag information for device nbor builds
  UCL_D_Vec<int> dev_tag;

  /// Cell list identifiers for hybrid nbor builds
  UCL_H_Vec<int> host_cell_id;
  /// Particle identifiers for hybrid nbor builds
  UCL_H_Vec<int> host_particle_id;

  /// Device timers
  UCL_Timer time_pos, time_q, time_quat;

  /// Geryon device
  UCL_Device *dev;

 private:
  #ifdef GPU_CAST
  UCL_Program *atom_program;
  UCL_Kernel k_cast_x;
  void compile_kernels(UCL_Device &dev);
  #endif

  bool _compiled;

  // True if data has been copied to device already
  bool _x_avail, _q_avail, _quat_avail, _resized;

  bool alloc(const int nall);

  bool _allocated, _rot, _charge, _bonds, _other;
  int _max_atoms, _nall, _gpu_nbor;
  bool _host_view;
  double _time_cast, _time_transfer;

  double _max_gpu_bytes;

  #ifdef USE_CUDPP
  CUDPPConfiguration sort_config;
  CUDPPHandle sort_plan;
  #endif
};

}

#endif