1 /***************************************************************************
2 yukawa_colloid.cpp
3 -------------------
4 Trung Dac Nguyen (ORNL)
5
6 Class for acceleration of the yukawa/colloid pair style.
7
8 __________________________________________________________________________
9 This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
10 __________________________________________________________________________
11
12 begin :
13 email : nguyentd@ornl.gov
14 ***************************************************************************/
15
16 #ifdef USE_OPENCL
17 #include "yukawa_colloid_cl.h"
18 #elif defined(USE_CUDART)
19 const char *yukawa_colloid=0;
20 #else
21 #include "yukawa_colloid_cubin.h"
22 #endif
23
24 #include "lal_yukawa_colloid.h"
25 #include <cassert>
26 namespace LAMMPS_AL {
27 #define YukawaColloidT YukawaColloid<numtyp, acctyp>
28
29 extern Device<PRECISION,ACC_PRECISION> device;
30
31 template <class numtyp, class acctyp>
YukawaColloid()32 YukawaColloidT::YukawaColloid() : BaseAtomic<numtyp,acctyp>(),
33 _max_rad_size(0), _allocated(false) {
34 }
35
36 template <class numtyp, class acctyp>
~YukawaColloid()37 YukawaColloidT::~YukawaColloid() {
38 clear();
39 }
40
41 template <class numtyp, class acctyp>
bytes_per_atom(const int max_nbors) const42 int YukawaColloidT::bytes_per_atom(const int max_nbors) const {
43 return this->bytes_per_atom_atomic(max_nbors);
44 }
45
46 template <class numtyp, class acctyp>
init(const int ntypes,double ** host_cutsq,double ** host_a,double ** host_offset,double * host_special_lj,const int nlocal,const int nall,const int max_nbors,const int maxspecial,const double cell_size,const double gpu_split,FILE * _screen,const double kappa)47 int YukawaColloidT::init(const int ntypes,
48 double **host_cutsq, double **host_a,
49 double **host_offset, double *host_special_lj, const int nlocal,
50 const int nall, const int max_nbors,
51 const int maxspecial, const double cell_size,
52 const double gpu_split, FILE *_screen, const double kappa) {
53 int success;
54 success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
55 _screen,yukawa_colloid,"k_yukawa_colloid");
56 if (success!=0)
57 return success;
58
59 if (this->ucl_device->shared_memory() && sizeof(numtyp)==sizeof(double))
60 _shared_view=true;
61 else
62 _shared_view=false;
63
64 // allocate rad
65
66 int ef_nall=nall;
67 if (ef_nall==0)
68 ef_nall=2000;
69
70 _max_rad_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
71
72 if (_shared_view==false)
73 c_rad.alloc(_max_rad_size,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
74
75 rad_tex.get_texture(*(this->pair_program),"rad_tex");
76 rad_tex.bind_float(c_rad,1);
77
78 // If atom type constants fit in shared memory use fast kernel
79 int lj_types=ntypes;
80 shared_types=false;
81 int max_shared_types=this->device->max_shared_types();
82 if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
83 lj_types=max_shared_types;
84 shared_types=true;
85 }
86 _lj_types=lj_types;
87
88 _kappa = kappa;
89
90 // Allocate a host write buffer for data initialization
91 UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
92 UCL_WRITE_ONLY);
93
94 for (int i=0; i<lj_types*lj_types*32; i++)
95 host_write[i]=(numtyp)0.0;
96
97 coeff.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
98 this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_a,
99 host_offset,host_cutsq);
100
101 UCL_H_Vec<double> dview;
102 sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
103 dview.view(host_special_lj,4,*(this->ucl_device));
104 ucl_copy(sp_lj,dview,false);
105
106 _allocated=true;
107 this->_max_bytes=coeff.row_bytes()+sp_lj.row_bytes();
108 return 0;
109 }
110
111 template <class numtyp, class acctyp>
clear()112 void YukawaColloidT::clear() {
113 if (!_allocated)
114 return;
115 _allocated=false;
116
117 coeff.clear();
118 sp_lj.clear();
119
120 c_rad.clear();
121
122 this->clear_atomic();
123 }
124
125 template <class numtyp, class acctyp>
host_memory_usage() const126 double YukawaColloidT::host_memory_usage() const {
127 return this->host_memory_usage_atomic()+sizeof(YukawaColloid<numtyp,acctyp>);
128 }
129
130 // ---------------------------------------------------------------------------
131 // Copy nbor list from host if necessary and then compute atom energies/forces
132 // ---------------------------------------------------------------------------
133 template <class numtyp, class acctyp>
compute(const int f_ago,const int inum_full,const int nall,double ** host_x,int * host_type,int * ilist,int * numj,int ** firstneigh,const bool eflag_in,const bool vflag_in,const bool eatom,const bool vatom,int & host_start,const double cpu_time,bool & success,double * rad)134 void YukawaColloidT::compute(const int f_ago, const int inum_full,
135 const int nall, double **host_x, int *host_type, int *ilist,
136 int *numj, int **firstneigh, const bool eflag_in,
137 const bool vflag_in, const bool eatom, const bool vatom,
138 int &host_start, const double cpu_time, bool &success,
139 double *rad) {
140 this->acc_timers();
141 int eflag, vflag;
142 if (eatom) eflag=2;
143 else if (eflag_in) eflag=1;
144 else eflag=0;
145 if (vatom) vflag=2;
146 else if (vflag_in) vflag=1;
147 else vflag=0;
148
149 #ifdef LAL_NO_BLOCK_REDUCE
150 if (eflag) eflag=2;
151 if (vflag) vflag=2;
152 #endif
153
154 this->set_kernel(eflag,vflag);
155
156 // ------------------- Resize rad array --------------------------
157
158 if (nall>_max_rad_size) {
159 _max_rad_size=static_cast<int>(static_cast<double>(nall)*1.10);
160 if (_shared_view==false) {
161 c_rad.resize(_max_rad_size);
162 rad_tex.bind_float(c_rad,1);
163 }
164 }
165
166 // ----------------------------------------------------------------
167
168 if (inum_full==0) {
169 host_start=0;
170 // Make sure textures are correct if realloc by a different hybrid style
171 this->resize_atom(0,nall,success);
172 this->zero_timers();
173 return;
174 }
175
176 int ago=this->hd_balancer.ago_first(f_ago);
177 int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
178 this->ans->inum(inum);
179 host_start=inum;
180
181 // -----------------------------------------------------------------
182
183 if (ago==0) {
184 this->reset_nbors(nall, inum, ilist, numj, firstneigh, success);
185 if (!success)
186 return;
187 }
188
189 this->atom->cast_x_data(host_x,host_type);
190 this->cast_rad_data(rad);
191 this->hd_balancer.start_timer();
192 this->atom->add_x_data(host_x,host_type);
193 this->add_rad_data();
194
195 const int red_blocks=this->loop(eflag,vflag);
196 this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks);
197 this->device->add_ans_object(this->ans);
198 this->hd_balancer.stop_timer();
199 }
200
201 // ---------------------------------------------------------------------------
202 // Reneighbor on GPU and then compute per-atom densities
203 // ---------------------------------------------------------------------------
204 template <class numtyp, class acctyp>
compute(const int ago,const int inum_full,const int nall,double ** host_x,int * host_type,double * sublo,double * subhi,tagint * tag,int ** nspecial,tagint ** special,const bool eflag_in,const bool vflag_in,const bool eatom,const bool vatom,int & host_start,int ** ilist,int ** jnum,const double cpu_time,bool & success,double * rad)205 int** YukawaColloidT::compute(const int ago, const int inum_full,
206 const int nall, double **host_x, int *host_type, double *sublo,
207 double *subhi, tagint *tag, int **nspecial,
208 tagint **special, const bool eflag_in, const bool vflag_in,
209 const bool eatom, const bool vatom, int &host_start,
210 int **ilist, int **jnum, const double cpu_time, bool &success,
211 double *rad) {
212 this->acc_timers();
213 int eflag, vflag;
214 if (eatom) eflag=2;
215 else if (eflag_in) eflag=1;
216 else eflag=0;
217 if (vatom) vflag=2;
218 else if (vflag_in) vflag=1;
219 else vflag=0;
220
221 #ifdef LAL_NO_BLOCK_REDUCE
222 if (eflag) eflag=2;
223 if (vflag) vflag=2;
224 #endif
225
226 this->set_kernel(eflag,vflag);
227
228 // ------------------- Resize rad array ----------------------------
229
230 if (nall>_max_rad_size) {
231 _max_rad_size=static_cast<int>(static_cast<double>(nall)*1.10);
232 if (_shared_view==false) {
233 c_rad.resize(_max_rad_size);
234 rad_tex.bind_float(c_rad,1);
235 }
236 }
237
238 // -----------------------------------------------------------------
239
240 if (inum_full==0) {
241 host_start=0;
242 // Make sure textures are correct if realloc by a different hybrid style
243 this->resize_atom(0,nall,success);
244 this->zero_timers();
245 return nullptr;
246 }
247
248 // load balance, returning the atom count on the device (inum)
249 this->hd_balancer.balance(cpu_time);
250 int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
251 this->ans->inum(inum);
252 host_start=inum;
253
254 // Build neighbor list on GPU if necessary
255 if (ago==0) {
256 this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
257 sublo, subhi, tag, nspecial, special, success);
258 if (!success)
259 return nullptr;
260 this->cast_rad_data(rad);
261 this->hd_balancer.start_timer();
262 } else {
263 this->atom->cast_x_data(host_x,host_type);
264 this->cast_rad_data(rad);
265 this->hd_balancer.start_timer();
266 this->atom->add_x_data(host_x,host_type);
267 }
268 this->add_rad_data();
269 *ilist=this->nbor->host_ilist.begin();
270 *jnum=this->nbor->host_acc.begin();
271
272 const int red_blocks=this->loop(eflag,vflag);
273 this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
274 this->device->add_ans_object(this->ans);
275 this->hd_balancer.stop_timer();
276
277 return this->nbor->host_jlist.begin()-host_start;
278 }
279
280 // ---------------------------------------------------------------------------
281 // Calculate per-atom energies and forces
282 // ---------------------------------------------------------------------------
283 template <class numtyp, class acctyp>
loop(const int eflag,const int vflag)284 int YukawaColloidT::loop(const int eflag, const int vflag) {
285 // Compute the block size and grid size to keep all cores busy
286 const int BX=this->block_size();
287 int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
288 (BX/this->_threads_per_atom)));
289
290 int ainum=this->ans->inum();
291 int nbor_pitch=this->nbor->nbor_pitch();
292 this->time_pair.start();
293 if (shared_types) {
294 this->k_pair_sel->set_size(GX,BX);
295 this->k_pair_sel->run(&this->atom->x, &c_rad, &coeff, &sp_lj,
296 &this->nbor->dev_nbor, &this->_nbor_data->begin(),
297 &this->ans->force, &this->ans->engv, &eflag, &vflag,
298 &ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa);
299 } else {
300 this->k_pair.set_size(GX,BX);
301 this->k_pair.run(&this->atom->x, &c_rad, &coeff, &_lj_types, &sp_lj,
302 &this->nbor->dev_nbor, &this->_nbor_data->begin(),
303 &this->ans->force, &this->ans->engv, &eflag, &vflag,
304 &ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa);
305 }
306 this->time_pair.stop();
307 return GX;
308 }
309
310 template class YukawaColloid<PRECISION,ACC_PRECISION>;
311 }
312