1 /***************************************************************************
2                                base_dipole.cpp
3                              -------------------
4                             Trung Dac Nguyen (ORNL)
5 
6   Base class for pair styles needing per-particle data for position,
7   dipole, and type.
8 
9  __________________________________________________________________________
10     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
11  __________________________________________________________________________
12 
13     begin                :
14     email                : nguyentd@ornl.gov
15  ***************************************************************************/
16 
17 #include "lal_base_dipole.h"
18 namespace LAMMPS_AL {
19 #define BaseDipoleT BaseDipole<numtyp, acctyp>
20 
21 extern Device<PRECISION,ACC_PRECISION> global_device;
22 
23 template <class numtyp, class acctyp>
BaseDipole()24 BaseDipoleT::BaseDipole() : _compiled(false), _max_bytes(0) {
25   device=&global_device;
26   ans=new Answer<numtyp,acctyp>();
27   nbor=new Neighbor();
28   pair_program=nullptr;
29   ucl_device=nullptr;
30   #if defined(LAL_OCL_EV_JIT)
31   pair_program_noev=nullptr;
32   #endif
33 }
34 
35 template <class numtyp, class acctyp>
~BaseDipole()36 BaseDipoleT::~BaseDipole() {
37   delete ans;
38   delete nbor;
39   k_pair_fast.clear();
40   k_pair.clear();
41   if (pair_program) delete pair_program;
42   #if defined(LAL_OCL_EV_JIT)
43   k_pair_noev.clear();
44   if (pair_program_noev) delete pair_program_noev;
45   #endif
46 }
47 
48 template <class numtyp, class acctyp>
bytes_per_atom_atomic(const int max_nbors) const49 int BaseDipoleT::bytes_per_atom_atomic(const int max_nbors) const {
50   return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
51          nbor->bytes_per_atom(max_nbors);
52 }
53 
54 template <class numtyp, class acctyp>
init_atomic(const int nlocal,const int nall,const int max_nbors,const int maxspecial,const double cell_size,const double gpu_split,FILE * _screen,const void * pair_program,const char * k_name)55 int BaseDipoleT::init_atomic(const int nlocal, const int nall,
56                              const int max_nbors, const int maxspecial,
57                              const double cell_size,
58                              const double gpu_split, FILE *_screen,
59                              const void *pair_program,
60                              const char *k_name) {
61   screen=_screen;
62 
63   int gpu_nbor=0;
64   if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH)
65     gpu_nbor=1;
66   else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
67     gpu_nbor=2;
68 
69   int _gpu_host=0;
70   int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
71   if (host_nlocal>0)
72     _gpu_host=1;
73 
74   _threads_per_atom=device->threads_per_charge();
75 
76   int success=device->init(*ans,true,true,nlocal,nall,maxspecial);
77   if (success!=0)
78     return success;
79 
80   if (ucl_device!=device->gpu) _compiled=false;
81 
82   ucl_device=device->gpu;
83   atom=&device->atom;
84 
85   _block_size=device->pair_block_size();
86   compile_kernels(*ucl_device,pair_program,k_name);
87 
88   if (_threads_per_atom>1 && gpu_nbor==0) {
89     nbor->packing(true);
90     _nbor_data=&(nbor->dev_packed);
91   } else
92     _nbor_data=&(nbor->dev_nbor);
93 
94   success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
95                   max_nbors,cell_size,false,_threads_per_atom);
96   if (success!=0)
97     return success;
98 
99   // Initialize host-device load balancer
100   hd_balancer.init(device,gpu_nbor,gpu_split);
101 
102   // Initialize timers for the selected GPU
103   time_pair.init(*ucl_device);
104   time_pair.zero();
105 
106   pos_tex.bind_float(atom->x,4);
107   q_tex.bind_float(atom->q,1);
108   mu_tex.bind_float(atom->quat,4);
109 
110   _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
111 
112   return success;
113 }
114 
115 template <class numtyp, class acctyp>
estimate_gpu_overhead()116 void BaseDipoleT::estimate_gpu_overhead() {
117   device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
118 }
119 
120 template <class numtyp, class acctyp>
clear_atomic()121 void BaseDipoleT::clear_atomic() {
122   // Output any timing information
123   acc_timers();
124   double avg_split=hd_balancer.all_avg_split();
125   _gpu_overhead*=hd_balancer.timestep();
126   _driver_overhead*=hd_balancer.timestep();
127   device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
128                        _gpu_overhead,_driver_overhead,_threads_per_atom,screen);
129 
130   time_pair.clear();
131   hd_balancer.clear();
132 
133   nbor->clear();
134   ans->clear();
135 }
136 
137 // ---------------------------------------------------------------------------
138 // Copy neighbor list from host
139 // ---------------------------------------------------------------------------
140 template <class numtyp, class acctyp>
reset_nbors(const int nall,const int inum,int * ilist,int * numj,int ** firstneigh,bool & success)141 int * BaseDipoleT::reset_nbors(const int nall, const int inum, int *ilist,
142                                    int *numj, int **firstneigh, bool &success) {
143   success=true;
144 
145   int mn=nbor->max_nbor_loop(inum,numj,ilist);
146   resize_atom(inum,nall,success);
147   resize_local(inum,mn,success);
148   if (!success)
149     return nullptr;
150 
151   nbor->get_host(inum,ilist,numj,firstneigh,block_size());
152 
153   double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
154   if (bytes>_max_an_bytes)
155     _max_an_bytes=bytes;
156 
157   return ilist;
158 }
159 
160 // ---------------------------------------------------------------------------
161 // Build neighbor list on device
162 // ---------------------------------------------------------------------------
163 template <class numtyp, class acctyp>
build_nbor_list(const int inum,const int host_inum,const int nall,double ** host_x,int * host_type,double * sublo,double * subhi,tagint * tag,int ** nspecial,tagint ** special,bool & success)164 inline void BaseDipoleT::build_nbor_list(const int inum, const int host_inum,
165                                          const int nall, double **host_x,
166                                          int *host_type, double *sublo,
167                                          double *subhi, tagint *tag,
168                                          int **nspecial, tagint **special,
169                                          bool &success) {
170   success=true;
171   resize_atom(inum,nall,success);
172   resize_local(inum,host_inum,nbor->max_nbors(),success);
173   if (!success)
174     return;
175   atom->cast_copy_x(host_x,host_type);
176 
177   int mn;
178   nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
179                         tag, nspecial, special, success, mn, ans->error_flag);
180 
181   double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
182   if (bytes>_max_an_bytes)
183     _max_an_bytes=bytes;
184 }
185 
186 // ---------------------------------------------------------------------------
187 // Copy nbor list from host if necessary and then calculate forces, virials,..
188 // ---------------------------------------------------------------------------
189 template <class numtyp, class acctyp>
compute(const int f_ago,const int inum_full,const int nall,double ** host_x,int * host_type,int * ilist,int * numj,int ** firstneigh,const bool eflag_in,const bool vflag_in,const bool eatom,const bool vatom,int & host_start,const double cpu_time,bool & success,double * host_q,double ** host_mu,const int nlocal,double * boxlo,double * prd)190 void BaseDipoleT::compute(const int f_ago, const int inum_full,
191                           const int nall, double **host_x, int *host_type,
192                           int *ilist, int *numj, int **firstneigh,
193                           const bool eflag_in, const bool vflag_in,
194                           const bool eatom, const bool vatom,
195                           int &host_start, const double cpu_time,
196                           bool &success, double *host_q, double **host_mu,
197                           const int nlocal, double *boxlo, double *prd) {
198   acc_timers();
199   int eflag, vflag;
200   if (eatom) eflag=2;
201   else if (eflag_in) eflag=1;
202   else eflag=0;
203   if (vatom) vflag=2;
204   else if (vflag_in) vflag=1;
205   else vflag=0;
206 
207   #ifdef LAL_NO_BLOCK_REDUCE
208   if (eflag) eflag=2;
209   if (vflag) vflag=2;
210   #endif
211 
212   set_kernel(eflag,vflag);
213   if (inum_full==0) {
214     host_start=0;
215     // Make sure textures are correct if realloc by a different hybrid style
216     resize_atom(0,nall,success);
217     zero_timers();
218     return;
219   }
220 
221   int ago=hd_balancer.ago_first(f_ago);
222   int inum=hd_balancer.balance(ago,inum_full,cpu_time);
223   ans->inum(inum);
224   host_start=inum;
225 
226   if (ago==0) {
227     reset_nbors(nall, inum, ilist, numj, firstneigh, success);
228     if (!success)
229       return;
230   }
231 
232   atom->cast_x_data(host_x,host_type);
233   atom->cast_q_data(host_q);
234   atom->cast_quat_data(host_mu[0]);
235   hd_balancer.start_timer();
236   atom->add_x_data(host_x,host_type);
237   atom->add_q_data();
238   atom->add_quat_data();
239 
240   device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q,
241                      boxlo, prd);
242 
243   const int red_blocks=loop(eflag,vflag);
244   ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks);
245   device->add_ans_object(ans);
246   hd_balancer.stop_timer();
247 }
248 
249 // ---------------------------------------------------------------------------
250 // Reneighbor on GPU if necessary and then compute forces, virials, energies
251 // ---------------------------------------------------------------------------
252 template <class numtyp, class acctyp>
compute(const int ago,const int inum_full,const int nall,double ** host_x,int * host_type,double * sublo,double * subhi,tagint * tag,int ** nspecial,tagint ** special,const bool eflag_in,const bool vflag_in,const bool eatom,const bool vatom,int & host_start,int ** ilist,int ** jnum,const double cpu_time,bool & success,double * host_q,double ** host_mu,double * boxlo,double * prd)253 int** BaseDipoleT::compute(const int ago, const int inum_full,
254                            const int nall, double **host_x, int *host_type,
255                            double *sublo, double *subhi, tagint *tag,
256                            int **nspecial, tagint **special,
257                            const bool eflag_in, const bool vflag_in,
258                            const bool eatom, const bool vatom,
259                            int &host_start, int **ilist, int **jnum,
260                            const double cpu_time, bool &success,
261                            double *host_q, double **host_mu,
262                            double *boxlo, double *prd) {
263   acc_timers();
264   int eflag, vflag;
265   if (eatom) eflag=2;
266   else if (eflag_in) eflag=1;
267   else eflag=0;
268   if (vatom) vflag=2;
269   else if (vflag_in) vflag=1;
270   else vflag=0;
271 
272   #ifdef LAL_NO_BLOCK_REDUCE
273   if (eflag) eflag=2;
274   if (vflag) vflag=2;
275   #endif
276 
277   set_kernel(eflag,vflag);
278   if (inum_full==0) {
279     host_start=0;
280     // Make sure textures are correct if realloc by a different hybrid style
281     resize_atom(0,nall,success);
282     zero_timers();
283     return nullptr;
284   }
285 
286   hd_balancer.balance(cpu_time);
287   int inum=hd_balancer.get_gpu_count(ago,inum_full);
288   ans->inum(inum);
289   host_start=inum;
290 
291   // Build neighbor list on GPU if necessary
292   if (ago==0) {
293     build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
294                     sublo, subhi, tag, nspecial, special, success);
295     if (!success)
296       return nullptr;
297     atom->cast_q_data(host_q);
298     atom->cast_quat_data(host_mu[0]);
299     hd_balancer.start_timer();
300   } else {
301     atom->cast_x_data(host_x,host_type);
302     atom->cast_q_data(host_q);
303     atom->cast_quat_data(host_mu[0]);
304     hd_balancer.start_timer();
305     atom->add_x_data(host_x,host_type);
306   }
307   atom->add_q_data();
308   atom->add_quat_data();
309   *ilist=nbor->host_ilist.begin();
310   *jnum=nbor->host_acc.begin();
311 
312   device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
313                      boxlo, prd);
314 
315   const int red_blocks=loop(eflag,vflag);
316   ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
317   device->add_ans_object(ans);
318   hd_balancer.stop_timer();
319 
320   return nbor->host_jlist.begin()-host_start;
321 }
322 
323 template <class numtyp, class acctyp>
host_memory_usage_atomic() const324 double BaseDipoleT::host_memory_usage_atomic() const {
325   return device->atom.host_memory_usage()+nbor->host_memory_usage()+
326          4*sizeof(numtyp)+sizeof(BaseDipole<numtyp,acctyp>);
327 }
328 
329 template <class numtyp, class acctyp>
compile_kernels(UCL_Device & dev,const void * pair_str,const char * kname)330 void BaseDipoleT::compile_kernels(UCL_Device &dev, const void *pair_str,
331                                   const char *kname) {
332   if (_compiled)
333     return;
334 
335   std::string s_fast=std::string(kname)+"_fast";
336   if (pair_program) delete pair_program;
337   pair_program=new UCL_Program(dev);
338   std::string oclstring = device->compile_string()+" -DEVFLAG=1";
339   pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
340   k_pair_fast.set_function(*pair_program,s_fast.c_str());
341   k_pair.set_function(*pair_program,kname);
342   pos_tex.get_texture(*pair_program,"pos_tex");
343   q_tex.get_texture(*pair_program,"q_tex");
344   mu_tex.get_texture(*pair_program,"mu_tex");
345 
346   #if defined(LAL_OCL_EV_JIT)
347   oclstring = device->compile_string()+" -DEVFLAG=0";
348   if (pair_program_noev) delete pair_program_noev;
349   pair_program_noev=new UCL_Program(dev);
350   pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen);
351   k_pair_noev.set_function(*pair_program_noev,s_fast.c_str());
352   #else
353   k_pair_sel = &k_pair_fast;
354   #endif
355 
356   _compiled=true;
357 
358   #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
359   if (dev.has_subgroup_support()) {
360     size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size);
361     #if defined(LAL_OCL_EV_JIT)
362     mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size));
363     #endif
364     if (_threads_per_atom > mx_subgroup_sz)
365       _threads_per_atom = mx_subgroup_sz;
366     device->set_simd_size(mx_subgroup_sz);
367   }
368   #endif
369 
370 }
371 
372 template class BaseDipole<PRECISION,ACC_PRECISION>;
373 }
374