1 /***************************************************************************
2                                 re_squared.cpp
3                              -------------------
4                                W. Michael Brown
5 
6   Host code for RE-Squared potential acceleration
7 
8  __________________________________________________________________________
9     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
10  __________________________________________________________________________
11 
12     begin                : Fri May 06 2011
13     email                : brownw@ornl.gov
14  ***************************************************************************/
15 
16 #if defined(USE_OPENCL)
17 #include "re_squared_cl.h"
18 #include "re_squared_lj_cl.h"
19 #elif defined(USE_CUDART)
20 const char *re_squared=0;
21 const char *re_squared_lj=0;
22 #else
23 #include "re_squared_cubin.h"
24 #include "re_squared_lj_cubin.h"
25 #endif
26 
27 #include "lal_re_squared.h"
28 #include <cassert>
29 using namespace LAMMPS_AL;
30 
31 #define RESquaredT RESquared<numtyp, acctyp>
32 extern Device<PRECISION,ACC_PRECISION> device;
33 
34 template <class numtyp, class acctyp>
RESquared()35 RESquaredT::RESquared() : BaseEllipsoid<numtyp,acctyp>(),
36                                   _allocated(false) {
37 }
38 
39 template <class numtyp, class acctyp>
~RESquared()40 RESquaredT::~RESquared() {
41   clear();
42 }
43 
44 template <class numtyp, class acctyp>
bytes_per_atom(const int max_nbors) const45 int RESquaredT::bytes_per_atom(const int max_nbors) const {
46   return this->bytes_per_atom(max_nbors);
47 }
48 
49 template <class numtyp, class acctyp>
init(const int ntypes,double ** host_shape,double ** host_well,double ** host_cutsq,double ** host_sigma,double ** host_epsilon,int ** h_form,double ** host_lj1,double ** host_lj2,double ** host_lj3,double ** host_lj4,double ** host_offset,const double * host_special_lj,const int nlocal,const int nall,const int max_nbors,const int maxspecial,const double cell_size,const double gpu_split,FILE * _screen)50 int RESquaredT::init(const int ntypes, double **host_shape, double **host_well,
51                      double **host_cutsq, double **host_sigma,
52                      double **host_epsilon, int **h_form, double **host_lj1,
53                      double **host_lj2, double **host_lj3, double **host_lj4,
54                      double **host_offset, const double *host_special_lj,
55                      const int nlocal, const int nall, const int max_nbors,
56                      const int maxspecial, const double cell_size,
57                      const double gpu_split, FILE *_screen) {
58   int success;
59   success=this->init_base(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
60                           _screen,ntypes,h_form,re_squared,re_squared_lj,
61                           "k_resquared",true);
62   if (success!=0)
63     return success;
64 
65   // If atom type constants fit in shared memory use fast kernel
66   int lj_types=ntypes;
67   _shared_types=false;
68   int max_shared_types=this->device->max_shared_types();
69   if (lj_types<=max_shared_types && this->block_size()>=max_shared_types) {
70     lj_types=max_shared_types;
71     _shared_types=true;
72   }
73   _lj_types=lj_types;
74 
75   // Allocate a host write buffer for copying type data
76   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
77                                UCL_WRITE_ONLY);
78 
79   for (int i=0; i<lj_types*lj_types; i++)
80     host_write[i]=0.0;
81 
82   sigma_epsilon.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
83   this->atom->type_pack2(ntypes,lj_types,sigma_epsilon,host_write,
84 			 host_sigma,host_epsilon);
85 
86   this->cut_form.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
87   this->atom->type_pack2(ntypes,lj_types,this->cut_form,host_write,
88 			 host_cutsq,h_form);
89 
90   lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
91   this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
92 			 host_cutsq,h_form);
93 
94   lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
95   this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
96 		         host_offset);
97 
98   dev_error.alloc(1,*(this->ucl_device),UCL_WRITE_ONLY);
99   dev_error.zero();
100 
101   // Allocate, cast and asynchronous memcpy of constant data
102   // Copy data for bonded interactions
103   special_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
104   host_write[0]=static_cast<numtyp>(host_special_lj[0]);
105   host_write[1]=static_cast<numtyp>(host_special_lj[1]);
106   host_write[2]=static_cast<numtyp>(host_special_lj[2]);
107   host_write[3]=static_cast<numtyp>(host_special_lj[3]);
108   ucl_copy(special_lj,host_write,4,false);
109 
110   // Copy shape, well, sigma, epsilon, and cutsq onto GPU
111   // - cast if necessary
112   shape.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY);
113   for (int i=0; i<ntypes; i++) {
114     host_write[i*4]=host_shape[i][0];
115     host_write[i*4+1]=host_shape[i][1];
116     host_write[i*4+2]=host_shape[i][2];
117   }
118   UCL_H_Vec<numtyp4> view4;
119   view4.view((numtyp4*)host_write.begin(),shape.numel(),*(this->ucl_device));
120   ucl_copy(shape,view4,false);
121 
122   well.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY);
123   for (int i=0; i<ntypes; i++) {
124     host_write[i*4]=host_well[i][0];
125     host_write[i*4+1]=host_well[i][1];
126     host_write[i*4+2]=host_well[i][2];
127   }
128   view4.view((numtyp4*)host_write.begin(),well.numel(),*(this->ucl_device));
129   ucl_copy(well,view4,false);
130 
131   _allocated=true;
132   this->_max_bytes=sigma_epsilon.row_bytes()+this->cut_form.row_bytes()+
133                    lj1.row_bytes()+lj3.row_bytes()+special_lj.row_bytes()+
134                    shape.row_bytes()+well.row_bytes();
135 
136   return 0;
137 }
138 
139 template <class numtyp, class acctyp>
clear()140 void RESquaredT::clear() {
141   if (!_allocated)
142     return;
143 
144   UCL_H_Vec<int> err_flag(1,*(this->ucl_device));
145   ucl_copy(err_flag,dev_error,false);
146   if (err_flag[0] == 2)
147     std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n";
148   err_flag.clear();
149 
150   _allocated=false;
151 
152   dev_error.clear();
153   lj1.clear();
154   lj3.clear();
155   sigma_epsilon.clear();
156   this->cut_form.clear();
157 
158   shape.clear();
159   well.clear();
160   special_lj.clear();
161 
162   this->clear_base();
163 }
164 
165 template <class numtyp, class acctyp>
host_memory_usage() const166 double RESquaredT::host_memory_usage() const {
167   return this->host_memory_usage_base()+sizeof(RESquaredT)+
168          4*sizeof(numtyp);
169 }
170 
171 // ---------------------------------------------------------------------------
172 // Calculate energies, forces, and torques
173 // ---------------------------------------------------------------------------
174 template <class numtyp, class acctyp>
loop(const bool _eflag,const bool _vflag)175 void RESquaredT::loop(const bool _eflag, const bool _vflag) {
176   const int BX=this->block_size();
177   int eflag, vflag;
178   if (_eflag)
179     eflag=1;
180   else
181     eflag=0;
182 
183   if (_vflag)
184     vflag=1;
185   else
186     vflag=0;
187 
188   int GX=0, NGX;
189   int stride=this->nbor->nbor_pitch();
190   int ainum=this->ans->inum();
191 
192   if (this->_multiple_forms) {
193     if (this->_last_ellipse>0) {
194       // ------------ ELLIPSE_ELLIPSE ---------------
195       this->time_nbor1.start();
196       GX=static_cast<int>(ceil(static_cast<double>(this->_last_ellipse)/
197                                (BX/this->_threads_per_atom)));
198       NGX=static_cast<int>(ceil(static_cast<double>(this->_last_ellipse)/BX));
199       this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_ELLIPSE,
200 			                 ELLIPSE_ELLIPSE,_shared_types,_lj_types);
201       this->time_nbor1.stop();
202 
203       this->time_ellipsoid.start();
204       this->k_ellipsoid.set_size(GX,BX);
205       this->k_ellipsoid.run(&this->atom->x, &this->atom->quat,
206                             &this->shape, &this->well, &this->special_lj,
207                             &this->sigma_epsilon, &this->_lj_types,
208                             &this->nbor->dev_nbor, &stride,
209                             &this->ans->force,&ainum, &this->ans->engv,
210                             &this->dev_error, &eflag, &vflag,
211                             &this->_last_ellipse, &this->_threads_per_atom);
212       this->time_ellipsoid.stop();
213 
214       // ------------ ELLIPSE_SPHERE ---------------
215       this->time_nbor2.start();
216       this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_SPHERE,
217 			                 ELLIPSE_SPHERE,_shared_types,_lj_types);
218       this->time_nbor2.stop();
219 
220       this->time_ellipsoid2.start();
221       this->k_ellipsoid_sphere.set_size(GX,BX);
222       this->k_ellipsoid_sphere.run(&this->atom->x, &this->atom->quat,
223                                    &this->shape, &this->well, &this->special_lj,
224                                    &this->sigma_epsilon, &this->_lj_types,
225                                    &this->nbor->dev_nbor, &stride,
226                                    &this->ans->force,&ainum,
227                                    &this->ans->engv, &this->dev_error,
228                                    &eflag, &vflag, &this->_last_ellipse,
229                                    &this->_threads_per_atom);
230       this->time_ellipsoid2.stop();
231 
232       if (this->_last_ellipse==this->ans->inum()) {
233         this->time_nbor3.zero();
234         this->time_ellipsoid3.zero();
235         this->time_lj.zero();
236         return;
237       }
238 
239       // ------------ SPHERE_ELLIPSE ---------------
240 
241       this->time_nbor3.start();
242       GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
243                                this->_last_ellipse)/
244                                (BX/this->_threads_per_atom)));
245       NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
246                                this->_last_ellipse)/BX));
247       this->pack_nbors(NGX,BX,this->_last_ellipse,this->ans->inum(),
248 			                 SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types);
249       this->time_nbor3.stop();
250 
251       this->time_ellipsoid3.start();
252       this->k_sphere_ellipsoid.set_size(GX,BX);
253       this->k_sphere_ellipsoid.run(&this->atom->x, &this->atom->quat,
254                                    &this->shape, &this->well, &this->special_lj,
255                                    &this->sigma_epsilon, &this->_lj_types,
256                                    &this->nbor->dev_nbor, &stride,
257                                    &this->ans->force, &this->ans->engv,
258                                    &this->dev_error, &eflag, &vflag,
259                                    &this->_last_ellipse, &ainum,
260                                    &this->_threads_per_atom);
261       this->time_ellipsoid3.stop();
262    } else {
263       GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
264                                this->_last_ellipse)/
265                                (BX/this->_threads_per_atom)));
266       this->ans->force.zero();
267       this->ans->engv.zero();
268       this->time_nbor1.zero();
269       this->time_ellipsoid.zero();
270       this->time_nbor2.zero();
271       this->time_ellipsoid2.zero();
272       this->time_nbor3.zero();
273       this->time_ellipsoid3.zero();
274     }
275 
276     // ------------         LJ      ---------------
277     this->time_lj.start();
278     if (this->_last_ellipse<this->ans->inum()) {
279       if (this->_shared_types) {
280         this->k_lj_fast.set_size(GX,BX);
281         this->k_lj_fast.run(&this->atom->x, &this->lj1, &this->lj3,
282                             &this->special_lj, &stride,
283                             &this->nbor->dev_packed, &this->ans->force,
284                             &this->ans->engv, &this->dev_error,
285                             &eflag, &vflag, &this->_last_ellipse, &ainum,
286                             &this->_threads_per_atom);
287       } else {
288         this->k_lj.set_size(GX,BX);
289         this->k_lj.run(&this->atom->x, &this->lj1, &this->lj3,
290                        &this->_lj_types, &this->special_lj, &stride,
291                        &this->nbor->dev_packed, &this->ans->force,
292                        &this->ans->engv, &this->dev_error, &eflag, &vflag,
293                        &this->_last_ellipse, &ainum, &this->_threads_per_atom);
294       }
295     }
296     this->time_lj.stop();
297   } else {
298     GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
299                              (BX/this->_threads_per_atom)));
300     NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
301     this->time_nbor1.start();
302     this->pack_nbors(NGX, BX, 0, this->ans->inum(),SPHERE_SPHERE,
303 		                 ELLIPSE_ELLIPSE,_shared_types,_lj_types);
304     this->time_nbor1.stop();
305     this->time_ellipsoid.start();
306     this->k_ellipsoid.set_size(GX,BX);
307     this->k_ellipsoid.run(&this->atom->x, &this->atom->quat,
308                           &this->shape, &this->well, &this->special_lj,
309                           &this->sigma_epsilon, &this->_lj_types,
310                           &this->nbor->dev_nbor, &stride, &this->ans->force,
311                           &ainum,  &this->ans->engv, &this->dev_error,
312                           &eflag, &vflag, &ainum, &this->_threads_per_atom);
313     this->time_ellipsoid.stop();
314   }
315 }
316 
317 template class RESquared<PRECISION,ACC_PRECISION>;
318 
319