1 /***************************************************************************
2 re_squared.cpp
3 -------------------
4 W. Michael Brown
5
6 Host code for RE-Squared potential acceleration
7
8 __________________________________________________________________________
9 This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
10 __________________________________________________________________________
11
12 begin : Fri May 06 2011
13 email : brownw@ornl.gov
14 ***************************************************************************/
15
16 #if defined(USE_OPENCL)
17 #include "re_squared_cl.h"
18 #include "re_squared_lj_cl.h"
19 #elif defined(USE_CUDART)
20 const char *re_squared=0;
21 const char *re_squared_lj=0;
22 #else
23 #include "re_squared_cubin.h"
24 #include "re_squared_lj_cubin.h"
25 #endif
26
27 #include "lal_re_squared.h"
28 #include <cassert>
29 using namespace LAMMPS_AL;
30
31 #define RESquaredT RESquared<numtyp, acctyp>
32 extern Device<PRECISION,ACC_PRECISION> device;
33
34 template <class numtyp, class acctyp>
RESquared()35 RESquaredT::RESquared() : BaseEllipsoid<numtyp,acctyp>(),
36 _allocated(false) {
37 }
38
39 template <class numtyp, class acctyp>
~RESquared()40 RESquaredT::~RESquared() {
41 clear();
42 }
43
44 template <class numtyp, class acctyp>
bytes_per_atom(const int max_nbors) const45 int RESquaredT::bytes_per_atom(const int max_nbors) const {
46 return this->bytes_per_atom(max_nbors);
47 }
48
49 template <class numtyp, class acctyp>
init(const int ntypes,double ** host_shape,double ** host_well,double ** host_cutsq,double ** host_sigma,double ** host_epsilon,int ** h_form,double ** host_lj1,double ** host_lj2,double ** host_lj3,double ** host_lj4,double ** host_offset,const double * host_special_lj,const int nlocal,const int nall,const int max_nbors,const int maxspecial,const double cell_size,const double gpu_split,FILE * _screen)50 int RESquaredT::init(const int ntypes, double **host_shape, double **host_well,
51 double **host_cutsq, double **host_sigma,
52 double **host_epsilon, int **h_form, double **host_lj1,
53 double **host_lj2, double **host_lj3, double **host_lj4,
54 double **host_offset, const double *host_special_lj,
55 const int nlocal, const int nall, const int max_nbors,
56 const int maxspecial, const double cell_size,
57 const double gpu_split, FILE *_screen) {
58 int success;
59 success=this->init_base(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
60 _screen,ntypes,h_form,re_squared,re_squared_lj,
61 "k_resquared",true);
62 if (success!=0)
63 return success;
64
65 // If atom type constants fit in shared memory use fast kernel
66 int lj_types=ntypes;
67 _shared_types=false;
68 int max_shared_types=this->device->max_shared_types();
69 if (lj_types<=max_shared_types && this->block_size()>=max_shared_types) {
70 lj_types=max_shared_types;
71 _shared_types=true;
72 }
73 _lj_types=lj_types;
74
75 // Allocate a host write buffer for copying type data
76 UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
77 UCL_WRITE_ONLY);
78
79 for (int i=0; i<lj_types*lj_types; i++)
80 host_write[i]=0.0;
81
82 sigma_epsilon.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
83 this->atom->type_pack2(ntypes,lj_types,sigma_epsilon,host_write,
84 host_sigma,host_epsilon);
85
86 this->cut_form.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
87 this->atom->type_pack2(ntypes,lj_types,this->cut_form,host_write,
88 host_cutsq,h_form);
89
90 lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
91 this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
92 host_cutsq,h_form);
93
94 lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
95 this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
96 host_offset);
97
98 dev_error.alloc(1,*(this->ucl_device),UCL_WRITE_ONLY);
99 dev_error.zero();
100
101 // Allocate, cast and asynchronous memcpy of constant data
102 // Copy data for bonded interactions
103 special_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
104 host_write[0]=static_cast<numtyp>(host_special_lj[0]);
105 host_write[1]=static_cast<numtyp>(host_special_lj[1]);
106 host_write[2]=static_cast<numtyp>(host_special_lj[2]);
107 host_write[3]=static_cast<numtyp>(host_special_lj[3]);
108 ucl_copy(special_lj,host_write,4,false);
109
110 // Copy shape, well, sigma, epsilon, and cutsq onto GPU
111 // - cast if necessary
112 shape.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY);
113 for (int i=0; i<ntypes; i++) {
114 host_write[i*4]=host_shape[i][0];
115 host_write[i*4+1]=host_shape[i][1];
116 host_write[i*4+2]=host_shape[i][2];
117 }
118 UCL_H_Vec<numtyp4> view4;
119 view4.view((numtyp4*)host_write.begin(),shape.numel(),*(this->ucl_device));
120 ucl_copy(shape,view4,false);
121
122 well.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY);
123 for (int i=0; i<ntypes; i++) {
124 host_write[i*4]=host_well[i][0];
125 host_write[i*4+1]=host_well[i][1];
126 host_write[i*4+2]=host_well[i][2];
127 }
128 view4.view((numtyp4*)host_write.begin(),well.numel(),*(this->ucl_device));
129 ucl_copy(well,view4,false);
130
131 _allocated=true;
132 this->_max_bytes=sigma_epsilon.row_bytes()+this->cut_form.row_bytes()+
133 lj1.row_bytes()+lj3.row_bytes()+special_lj.row_bytes()+
134 shape.row_bytes()+well.row_bytes();
135
136 return 0;
137 }
138
139 template <class numtyp, class acctyp>
clear()140 void RESquaredT::clear() {
141 if (!_allocated)
142 return;
143
144 UCL_H_Vec<int> err_flag(1,*(this->ucl_device));
145 ucl_copy(err_flag,dev_error,false);
146 if (err_flag[0] == 2)
147 std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n";
148 err_flag.clear();
149
150 _allocated=false;
151
152 dev_error.clear();
153 lj1.clear();
154 lj3.clear();
155 sigma_epsilon.clear();
156 this->cut_form.clear();
157
158 shape.clear();
159 well.clear();
160 special_lj.clear();
161
162 this->clear_base();
163 }
164
165 template <class numtyp, class acctyp>
host_memory_usage() const166 double RESquaredT::host_memory_usage() const {
167 return this->host_memory_usage_base()+sizeof(RESquaredT)+
168 4*sizeof(numtyp);
169 }
170
171 // ---------------------------------------------------------------------------
172 // Calculate energies, forces, and torques
173 // ---------------------------------------------------------------------------
174 template <class numtyp, class acctyp>
loop(const bool _eflag,const bool _vflag)175 void RESquaredT::loop(const bool _eflag, const bool _vflag) {
176 const int BX=this->block_size();
177 int eflag, vflag;
178 if (_eflag)
179 eflag=1;
180 else
181 eflag=0;
182
183 if (_vflag)
184 vflag=1;
185 else
186 vflag=0;
187
188 int GX=0, NGX;
189 int stride=this->nbor->nbor_pitch();
190 int ainum=this->ans->inum();
191
192 if (this->_multiple_forms) {
193 if (this->_last_ellipse>0) {
194 // ------------ ELLIPSE_ELLIPSE ---------------
195 this->time_nbor1.start();
196 GX=static_cast<int>(ceil(static_cast<double>(this->_last_ellipse)/
197 (BX/this->_threads_per_atom)));
198 NGX=static_cast<int>(ceil(static_cast<double>(this->_last_ellipse)/BX));
199 this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_ELLIPSE,
200 ELLIPSE_ELLIPSE,_shared_types,_lj_types);
201 this->time_nbor1.stop();
202
203 this->time_ellipsoid.start();
204 this->k_ellipsoid.set_size(GX,BX);
205 this->k_ellipsoid.run(&this->atom->x, &this->atom->quat,
206 &this->shape, &this->well, &this->special_lj,
207 &this->sigma_epsilon, &this->_lj_types,
208 &this->nbor->dev_nbor, &stride,
209 &this->ans->force,&ainum, &this->ans->engv,
210 &this->dev_error, &eflag, &vflag,
211 &this->_last_ellipse, &this->_threads_per_atom);
212 this->time_ellipsoid.stop();
213
214 // ------------ ELLIPSE_SPHERE ---------------
215 this->time_nbor2.start();
216 this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_SPHERE,
217 ELLIPSE_SPHERE,_shared_types,_lj_types);
218 this->time_nbor2.stop();
219
220 this->time_ellipsoid2.start();
221 this->k_ellipsoid_sphere.set_size(GX,BX);
222 this->k_ellipsoid_sphere.run(&this->atom->x, &this->atom->quat,
223 &this->shape, &this->well, &this->special_lj,
224 &this->sigma_epsilon, &this->_lj_types,
225 &this->nbor->dev_nbor, &stride,
226 &this->ans->force,&ainum,
227 &this->ans->engv, &this->dev_error,
228 &eflag, &vflag, &this->_last_ellipse,
229 &this->_threads_per_atom);
230 this->time_ellipsoid2.stop();
231
232 if (this->_last_ellipse==this->ans->inum()) {
233 this->time_nbor3.zero();
234 this->time_ellipsoid3.zero();
235 this->time_lj.zero();
236 return;
237 }
238
239 // ------------ SPHERE_ELLIPSE ---------------
240
241 this->time_nbor3.start();
242 GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
243 this->_last_ellipse)/
244 (BX/this->_threads_per_atom)));
245 NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
246 this->_last_ellipse)/BX));
247 this->pack_nbors(NGX,BX,this->_last_ellipse,this->ans->inum(),
248 SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types);
249 this->time_nbor3.stop();
250
251 this->time_ellipsoid3.start();
252 this->k_sphere_ellipsoid.set_size(GX,BX);
253 this->k_sphere_ellipsoid.run(&this->atom->x, &this->atom->quat,
254 &this->shape, &this->well, &this->special_lj,
255 &this->sigma_epsilon, &this->_lj_types,
256 &this->nbor->dev_nbor, &stride,
257 &this->ans->force, &this->ans->engv,
258 &this->dev_error, &eflag, &vflag,
259 &this->_last_ellipse, &ainum,
260 &this->_threads_per_atom);
261 this->time_ellipsoid3.stop();
262 } else {
263 GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
264 this->_last_ellipse)/
265 (BX/this->_threads_per_atom)));
266 this->ans->force.zero();
267 this->ans->engv.zero();
268 this->time_nbor1.zero();
269 this->time_ellipsoid.zero();
270 this->time_nbor2.zero();
271 this->time_ellipsoid2.zero();
272 this->time_nbor3.zero();
273 this->time_ellipsoid3.zero();
274 }
275
276 // ------------ LJ ---------------
277 this->time_lj.start();
278 if (this->_last_ellipse<this->ans->inum()) {
279 if (this->_shared_types) {
280 this->k_lj_fast.set_size(GX,BX);
281 this->k_lj_fast.run(&this->atom->x, &this->lj1, &this->lj3,
282 &this->special_lj, &stride,
283 &this->nbor->dev_packed, &this->ans->force,
284 &this->ans->engv, &this->dev_error,
285 &eflag, &vflag, &this->_last_ellipse, &ainum,
286 &this->_threads_per_atom);
287 } else {
288 this->k_lj.set_size(GX,BX);
289 this->k_lj.run(&this->atom->x, &this->lj1, &this->lj3,
290 &this->_lj_types, &this->special_lj, &stride,
291 &this->nbor->dev_packed, &this->ans->force,
292 &this->ans->engv, &this->dev_error, &eflag, &vflag,
293 &this->_last_ellipse, &ainum, &this->_threads_per_atom);
294 }
295 }
296 this->time_lj.stop();
297 } else {
298 GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
299 (BX/this->_threads_per_atom)));
300 NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
301 this->time_nbor1.start();
302 this->pack_nbors(NGX, BX, 0, this->ans->inum(),SPHERE_SPHERE,
303 ELLIPSE_ELLIPSE,_shared_types,_lj_types);
304 this->time_nbor1.stop();
305 this->time_ellipsoid.start();
306 this->k_ellipsoid.set_size(GX,BX);
307 this->k_ellipsoid.run(&this->atom->x, &this->atom->quat,
308 &this->shape, &this->well, &this->special_lj,
309 &this->sigma_epsilon, &this->_lj_types,
310 &this->nbor->dev_nbor, &stride, &this->ans->force,
311 &ainum, &this->ans->engv, &this->dev_error,
312 &eflag, &vflag, &ainum, &this->_threads_per_atom);
313 this->time_ellipsoid.stop();
314 }
315 }
316
317 template class RESquared<PRECISION,ACC_PRECISION>;
318
319