1 /***************************************************************************
2                               yukawa_colloid.cpp
3                              -------------------
4                             Trung Dac Nguyen (ORNL)
5 
6   Class for acceleration of the yukawa/colloid pair style.
7 
8  __________________________________________________________________________
9     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
10  __________________________________________________________________________
11 
12     begin                :
13     email                : nguyentd@ornl.gov
14  ***************************************************************************/
15 
16 #ifdef USE_OPENCL
17 #include "yukawa_colloid_cl.h"
18 #elif defined(USE_CUDART)
19 const char *yukawa_colloid=0;
20 #else
21 #include "yukawa_colloid_cubin.h"
22 #endif
23 
24 #include "lal_yukawa_colloid.h"
25 #include <cassert>
26 namespace LAMMPS_AL {
27 #define YukawaColloidT YukawaColloid<numtyp, acctyp>
28 
29 extern Device<PRECISION,ACC_PRECISION> device;
30 
31 template <class numtyp, class acctyp>
YukawaColloid()32 YukawaColloidT::YukawaColloid() : BaseAtomic<numtyp,acctyp>(),
33 _max_rad_size(0), _allocated(false) {
34 }
35 
36 template <class numtyp, class acctyp>
~YukawaColloid()37 YukawaColloidT::~YukawaColloid() {
38   clear();
39 }
40 
41 template <class numtyp, class acctyp>
bytes_per_atom(const int max_nbors) const42 int YukawaColloidT::bytes_per_atom(const int max_nbors) const {
43   return this->bytes_per_atom_atomic(max_nbors);
44 }
45 
46 template <class numtyp, class acctyp>
init(const int ntypes,double ** host_cutsq,double ** host_a,double ** host_offset,double * host_special_lj,const int nlocal,const int nall,const int max_nbors,const int maxspecial,const double cell_size,const double gpu_split,FILE * _screen,const double kappa)47 int YukawaColloidT::init(const int ntypes,
48                    double **host_cutsq, double **host_a,
49                    double **host_offset, double *host_special_lj, const int nlocal,
50                    const int nall, const int max_nbors,
51                    const int maxspecial, const double cell_size,
52                    const double gpu_split, FILE *_screen, const double kappa) {
53   int success;
54   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
55                             _screen,yukawa_colloid,"k_yukawa_colloid");
56   if (success!=0)
57     return success;
58 
59   if (this->ucl_device->shared_memory() && sizeof(numtyp)==sizeof(double))
60     _shared_view=true;
61   else
62     _shared_view=false;
63 
64   // allocate rad
65 
66   int ef_nall=nall;
67   if (ef_nall==0)
68     ef_nall=2000;
69 
70   _max_rad_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
71 
72   if (_shared_view==false)
73     c_rad.alloc(_max_rad_size,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
74 
75   rad_tex.get_texture(*(this->pair_program),"rad_tex");
76   rad_tex.bind_float(c_rad,1);
77 
78   // If atom type constants fit in shared memory use fast kernel
79   int lj_types=ntypes;
80   shared_types=false;
81   int max_shared_types=this->device->max_shared_types();
82   if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
83     lj_types=max_shared_types;
84     shared_types=true;
85   }
86   _lj_types=lj_types;
87 
88   _kappa = kappa;
89 
90   // Allocate a host write buffer for data initialization
91   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
92                                UCL_WRITE_ONLY);
93 
94   for (int i=0; i<lj_types*lj_types*32; i++)
95     host_write[i]=(numtyp)0.0;
96 
97   coeff.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
98   this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_a,
99                          host_offset,host_cutsq);
100 
101   UCL_H_Vec<double> dview;
102   sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
103   dview.view(host_special_lj,4,*(this->ucl_device));
104   ucl_copy(sp_lj,dview,false);
105 
106   _allocated=true;
107   this->_max_bytes=coeff.row_bytes()+sp_lj.row_bytes();
108   return 0;
109 }
110 
111 template <class numtyp, class acctyp>
clear()112 void YukawaColloidT::clear() {
113   if (!_allocated)
114     return;
115   _allocated=false;
116 
117   coeff.clear();
118   sp_lj.clear();
119 
120   c_rad.clear();
121 
122   this->clear_atomic();
123 }
124 
125 template <class numtyp, class acctyp>
host_memory_usage() const126 double YukawaColloidT::host_memory_usage() const {
127   return this->host_memory_usage_atomic()+sizeof(YukawaColloid<numtyp,acctyp>);
128 }
129 
130 // ---------------------------------------------------------------------------
131 // Copy nbor list from host if necessary and then compute atom energies/forces
132 // ---------------------------------------------------------------------------
133 template <class numtyp, class acctyp>
compute(const int f_ago,const int inum_full,const int nall,double ** host_x,int * host_type,int * ilist,int * numj,int ** firstneigh,const bool eflag_in,const bool vflag_in,const bool eatom,const bool vatom,int & host_start,const double cpu_time,bool & success,double * rad)134 void YukawaColloidT::compute(const int f_ago, const int inum_full,
135                const int nall, double **host_x, int *host_type, int *ilist,
136                int *numj, int **firstneigh, const bool eflag_in,
137                const bool vflag_in, const bool eatom, const bool vatom,
138                int &host_start, const double cpu_time, bool &success,
139                double *rad) {
140   this->acc_timers();
141   int eflag, vflag;
142   if (eatom) eflag=2;
143   else if (eflag_in) eflag=1;
144   else eflag=0;
145   if (vatom) vflag=2;
146   else if (vflag_in) vflag=1;
147   else vflag=0;
148 
149   #ifdef LAL_NO_BLOCK_REDUCE
150   if (eflag) eflag=2;
151   if (vflag) vflag=2;
152   #endif
153 
154   this->set_kernel(eflag,vflag);
155 
156   // ------------------- Resize rad array --------------------------
157 
158   if (nall>_max_rad_size) {
159     _max_rad_size=static_cast<int>(static_cast<double>(nall)*1.10);
160     if (_shared_view==false) {
161       c_rad.resize(_max_rad_size);
162       rad_tex.bind_float(c_rad,1);
163     }
164   }
165 
166   // ----------------------------------------------------------------
167 
168   if (inum_full==0) {
169     host_start=0;
170     // Make sure textures are correct if realloc by a different hybrid style
171     this->resize_atom(0,nall,success);
172     this->zero_timers();
173     return;
174   }
175 
176   int ago=this->hd_balancer.ago_first(f_ago);
177   int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
178   this->ans->inum(inum);
179   host_start=inum;
180 
181   // -----------------------------------------------------------------
182 
183   if (ago==0) {
184     this->reset_nbors(nall, inum, ilist, numj, firstneigh, success);
185     if (!success)
186       return;
187   }
188 
189   this->atom->cast_x_data(host_x,host_type);
190   this->cast_rad_data(rad);
191   this->hd_balancer.start_timer();
192   this->atom->add_x_data(host_x,host_type);
193   this->add_rad_data();
194 
195   const int red_blocks=this->loop(eflag,vflag);
196   this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks);
197   this->device->add_ans_object(this->ans);
198   this->hd_balancer.stop_timer();
199 }
200 
201 // ---------------------------------------------------------------------------
202 // Reneighbor on GPU and then compute per-atom densities
203 // ---------------------------------------------------------------------------
204 template <class numtyp, class acctyp>
compute(const int ago,const int inum_full,const int nall,double ** host_x,int * host_type,double * sublo,double * subhi,tagint * tag,int ** nspecial,tagint ** special,const bool eflag_in,const bool vflag_in,const bool eatom,const bool vatom,int & host_start,int ** ilist,int ** jnum,const double cpu_time,bool & success,double * rad)205 int** YukawaColloidT::compute(const int ago, const int inum_full,
206                 const int nall, double **host_x, int *host_type, double *sublo,
207                 double *subhi, tagint *tag, int **nspecial,
208                 tagint **special, const bool eflag_in, const bool vflag_in,
209                 const bool eatom, const bool vatom, int &host_start,
210                 int **ilist, int **jnum, const double cpu_time, bool &success,
211                 double *rad) {
212   this->acc_timers();
213   int eflag, vflag;
214   if (eatom) eflag=2;
215   else if (eflag_in) eflag=1;
216   else eflag=0;
217   if (vatom) vflag=2;
218   else if (vflag_in) vflag=1;
219   else vflag=0;
220 
221   #ifdef LAL_NO_BLOCK_REDUCE
222   if (eflag) eflag=2;
223   if (vflag) vflag=2;
224   #endif
225 
226   this->set_kernel(eflag,vflag);
227 
228   // ------------------- Resize rad array ----------------------------
229 
230   if (nall>_max_rad_size) {
231     _max_rad_size=static_cast<int>(static_cast<double>(nall)*1.10);
232     if (_shared_view==false) {
233       c_rad.resize(_max_rad_size);
234       rad_tex.bind_float(c_rad,1);
235     }
236   }
237 
238   // -----------------------------------------------------------------
239 
240   if (inum_full==0) {
241     host_start=0;
242     // Make sure textures are correct if realloc by a different hybrid style
243     this->resize_atom(0,nall,success);
244     this->zero_timers();
245     return nullptr;
246   }
247 
248   // load balance, returning the atom count on the device (inum)
249   this->hd_balancer.balance(cpu_time);
250   int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
251   this->ans->inum(inum);
252   host_start=inum;
253 
254   // Build neighbor list on GPU if necessary
255   if (ago==0) {
256     this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
257                           sublo, subhi, tag, nspecial, special, success);
258     if (!success)
259       return nullptr;
260     this->cast_rad_data(rad);
261     this->hd_balancer.start_timer();
262   } else {
263     this->atom->cast_x_data(host_x,host_type);
264     this->cast_rad_data(rad);
265     this->hd_balancer.start_timer();
266     this->atom->add_x_data(host_x,host_type);
267   }
268   this->add_rad_data();
269   *ilist=this->nbor->host_ilist.begin();
270   *jnum=this->nbor->host_acc.begin();
271 
272   const int red_blocks=this->loop(eflag,vflag);
273   this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
274   this->device->add_ans_object(this->ans);
275   this->hd_balancer.stop_timer();
276 
277   return this->nbor->host_jlist.begin()-host_start;
278 }
279 
280 // ---------------------------------------------------------------------------
281 // Calculate per-atom energies and forces
282 // ---------------------------------------------------------------------------
283 template <class numtyp, class acctyp>
loop(const int eflag,const int vflag)284 int YukawaColloidT::loop(const int eflag, const int vflag) {
285   // Compute the block size and grid size to keep all cores busy
286   const int BX=this->block_size();
287   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
288                                (BX/this->_threads_per_atom)));
289 
290   int ainum=this->ans->inum();
291   int nbor_pitch=this->nbor->nbor_pitch();
292   this->time_pair.start();
293   if (shared_types) {
294     this->k_pair_sel->set_size(GX,BX);
295     this->k_pair_sel->run(&this->atom->x, &c_rad, &coeff, &sp_lj,
296                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
297                           &this->ans->force, &this->ans->engv, &eflag, &vflag,
298                           &ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa);
299   } else {
300     this->k_pair.set_size(GX,BX);
301     this->k_pair.run(&this->atom->x, &c_rad, &coeff, &_lj_types, &sp_lj,
302                      &this->nbor->dev_nbor, &this->_nbor_data->begin(),
303                      &this->ans->force, &this->ans->engv, &eflag, &vflag,
304                      &ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa);
305   }
306   this->time_pair.stop();
307   return GX;
308 }
309 
310 template class YukawaColloid<PRECISION,ACC_PRECISION>;
311 }
312