1 /***************************************************************************
2                                   pppm.cpp
3                              -------------------
4                             W. Michael Brown (ORNL)
5 
6   Class for PPPM acceleration
7 
8  __________________________________________________________________________
9     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
10  __________________________________________________________________________
11 
12     begin                :
13     email                : brownw@ornl.gov
14  ***************************************************************************/
15 
16 #if defined(USE_OPENCL)
17 #include "pppm_cl.h"
18 #elif defined(USE_CUDART)
19 const char *pppm_f=0;
20 const char *pppm_d=0;
21 #else
22 #include "pppm_f_cubin.h"
23 #include "pppm_d_cubin.h"
24 #endif
25 #include "lal_pppm.h"
26 #include <cassert>
27 
28 using namespace LAMMPS_AL;
29 #define PPPMT PPPM<numtyp, acctyp, grdtyp, grdtyp4>
30 
31 extern Device<PRECISION,ACC_PRECISION> global_device;
32 
33 template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
PPPM()34 PPPMT::PPPM() : _allocated(false), _compiled(false),
35                                   _max_bytes(0) {
36   device=&global_device;
37   ans=new Answer<numtyp,acctyp>();
38 }
39 
40 template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
~PPPM()41 PPPMT::~PPPM() {
42   clear(0.0);
43   delete ans;
44 }
45 
46 template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
bytes_per_atom() const47 int PPPMT::bytes_per_atom() const {
48   return device->atom.bytes_per_atom()+ans->bytes_per_atom()+1;
49 }
50 
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
                     const int order, const int nxlo_out,
                     const int nylo_out, const int nzlo_out,
                     const int nxhi_out, const int nyhi_out,
                     const int nzhi_out, grdtyp **rho_coeff,
                     grdtyp **vd_brick_p, const double slab_volfactor,
                     const int nx_pppm, const int ny_pppm,
                     const int nz_pppm, const bool split, int &flag) {
  // Initialize device state, compile the PPPM kernels, and allocate all
  // grid/brick storage for the local domain (nxlo_out..nzhi_out inclusive).
  // On success returns the host pointer to the density brick and writes the
  // host pointer for the force brick to *vd_brick_p; on failure returns 0
  // with an error code in flag:
  //   -5  double-precision grid requested on a device without FP64 support
  //   -4  device compute capability too old (< 1.1)
  //   -3  a device allocation failed
  //   (other non-zero values come straight from device->init)
  _max_bytes=10;          // non-zero placeholder; reset below once tracking starts
  screen=_screen;
  _kspace_split=split;
  bool success=true;

  flag=device->init(*ans,nlocal,nall);
  if (flag!=0)
    return 0;
  if (sizeof(grdtyp)==sizeof(double) && device->double_precision()==false) {
    flag=-5;
    return 0;
  }
  if (device->ptx_arch()>0.0 && device->ptx_arch()<1.1) {
    flag=-4;
    return 0;
  }

  ucl_device=device->gpu;
  atom=&device->atom;

  // Thread-block geometry: pencils are groups of _pencil_size memory threads;
  // a block processes _block_pencils pencils at once.
  _block_size=device->pppm_block();
  _pencil_size=device->num_mem_threads();
  _block_pencils=_block_size/_pencil_size;

  compile_kernels(*ucl_device);

  // Initialize timers for the selected GPU
  time_in.init(*ucl_device);
  time_in.zero();
  time_out.init(*ucl_device);
  time_out.zero();
  time_map.init(*ucl_device);
  time_map.zero();
  time_rho.init(*ucl_device);
  time_rho.zero();
  time_interp.init(*ucl_device);
  time_interp.zero();

  // Bind position (xyz + type) and charge data to textures for the kernels.
  pos_tex.bind_float(atom->x,4);
  q_tex.bind_float(atom->q,1);

  _allocated=true;
  _max_bytes=0;
  _max_an_bytes=ans->gpu_bytes();

  // Spline (charge-assignment) order and derived stencil bounds.
  _order=order;
  _order_m_1=order-1;
  _order2=_order_m_1*_order;
  _nlower=-(_order-1)/2;
  _nupper=order/2;
  _nxlo_out=nxlo_out;
  _nylo_out=nylo_out;
  _nzlo_out=nzlo_out;
  _nxhi_out=nxhi_out;
  _nyhi_out=nyhi_out;
  _nzhi_out=nzhi_out;

  _slab_volfactor=slab_volfactor;
  _nx_pppm=nx_pppm;
  _ny_pppm=ny_pppm;
  _nz_pppm=nz_pppm;

  // Initial per-grid-point atom capacity; doubled on overflow in spread().
  _max_brick_atoms=10;

  // Get rho_coeff on device (viewed from the host array starting at column
  // n2lo so only the used coefficient range is copied).
  int n2lo=(1-order)/2;
  int numel=order*( order/2 - n2lo + 1 );
  success=success && (d_rho_coeff.alloc(numel,*ucl_device,UCL_READ_ONLY)==
                      UCL_SUCCESS);
  UCL_H_Vec<grdtyp> view;
  view.view(rho_coeff[0]+n2lo,numel,*ucl_device);
  ucl_copy(d_rho_coeff,view,true);
  _max_bytes+=d_rho_coeff.row_bytes();

  // Allocate storage for grid (density brick and 4-component force brick).
  _npts_x=nxhi_out-nxlo_out+1;
  _npts_y=nyhi_out-nylo_out+1;
  _npts_z=nzhi_out-nzlo_out+1;
  _npts_yx=_npts_x*_npts_y;
  success=success && (brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device,
                                  UCL_READ_ONLY,UCL_WRITE_ONLY)==UCL_SUCCESS);
  success=success && (vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device,
                                    UCL_READ_WRITE,UCL_READ_ONLY)==UCL_SUCCESS);
  *vd_brick_p=vd_brick.host.begin();
  _max_bytes+=brick.device.row_bytes()+vd_brick.device.row_bytes();

  // Allocate vector with count of atoms assigned to each grid point
  // (interior points only: ghost stencil width _nupper-_nlower removed).
  _nlocal_x=_npts_x+_nlower-_nupper;
  _nlocal_y=_npts_y+_nlower-_nupper;
  _nlocal_z=_npts_z+_nlower-_nupper;
  _nlocal_yx=_nlocal_x*_nlocal_y;
  _atom_stride=_nlocal_x*_nlocal_y*_nlocal_z;
  success=success && (d_brick_counts.alloc(_atom_stride,*ucl_device)==
                      UCL_SUCCESS);
  _max_bytes+=d_brick_counts.row_bytes();

  // Allocate storage for atoms assigned to each grid point
  success=success && (d_brick_atoms.alloc(_atom_stride*_max_brick_atoms,
                                          *ucl_device)==UCL_SUCCESS);
  _max_bytes+=d_brick_atoms.row_bytes();

  // Allocate error flags for checking out of bounds atoms
  success=success && (error_flag.alloc(1,*ucl_device,UCL_READ_ONLY,
                                       UCL_WRITE_ONLY)==UCL_SUCCESS);
  if (!success) {
    flag=-3;
    return 0;
  }

  error_flag.device.zero();
  _max_bytes+=1;

  _cpu_idle_time=0.0;

  return brick.host.begin();
}
176 
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
void PPPMT::clear(const double cpu_time) {
  // Free all device storage, report accumulated k-space timings, and release
  // the compiled kernels.  No-op unless init() completed successfully.
  if (!_allocated)
    return;
  _allocated=false;
  _precompute_done=false;

  brick.clear();
  vd_brick.clear();
  d_brick_counts.clear();
  error_flag.clear();
  d_brick_atoms.clear();

  // Timings must be accumulated and reported before the timers are cleared.
  acc_timers();
  device->output_kspace_times(time_in,time_out,time_map,time_rho,time_interp,
                              *ans,_max_bytes+_max_an_bytes,cpu_time,
                              _cpu_idle_time,screen);

  if (_compiled) {
    k_particle_map.clear();
    k_make_rho.clear();
    k_interp.clear();
    delete pppm_program;
    _compiled=false;
  }

  time_in.clear();
  time_out.clear();
  time_map.clear();
  time_rho.clear();
  time_interp.clear();

  ans->clear();
  device->clear();
}
212 
213 // ---------------------------------------------------------------------------
214 // Charge assignment that can be performed asynchronously
215 // ---------------------------------------------------------------------------
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
void PPPMT::_precompute(const int ago, const int nlocal, const int nall,
                        double **host_x, int *host_type, bool &success,
                        double *host_q, double *boxlo,
                        const double delxinv, const double delyinv,
                        const double delzinv) {
  // Asynchronous part of charge assignment: copy atom data to the device,
  // map atoms onto grid points (k_particle_map), spread charge onto the
  // density brick (k_make_rho), and start the async copy of the brick and
  // error flag back to the host.  Sets _precompute_done so spread() knows
  // the work was launched.  On reneighbor steps (ago==0) storage is resized
  // first; success is set false if that fails.
  acc_timers();
  if (nlocal==0) {
    zero_timers();
    return;
  }

  ans->inum(nlocal);

  if (ago==0) {
    resize_atom(nlocal,nall,success);
    resize_local(nlocal,success);
    if (!success)
      return;

    double bytes=ans->gpu_bytes();
    if (bytes>_max_an_bytes)
      _max_an_bytes=bytes;
  }

  // Cast host data to device precision and transfer positions and charges.
  atom->cast_x_data(host_x,host_type);
  atom->cast_q_data(host_q);
  atom->add_x_data(host_x,host_type);
  atom->add_q_data();

  time_map.start();

  // Compute the block size and grid size to keep all cores busy
  int BX=this->block_size();
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));

  int ainum=this->ans->inum();

  // Boxlo adjusted to be upper left brick and shift for even spline order
  double shift=0.0;
  if (_order % 2)
    shift=0.5;
  _brick_x=boxlo[0]+(_nxlo_out-_nlower-shift)/delxinv;
  _brick_y=boxlo[1]+(_nylo_out-_nlower-shift)/delyinv;
  _brick_z=boxlo[2]+(_nzlo_out-_nlower-shift)/delzinv;

  _delxinv=delxinv;
  _delyinv=delyinv;
  _delzinv=delzinv;
  double delvolinv = delxinv*delyinv*delzinv;
  grdtyp f_delvolinv = delvolinv;   // cast to grid precision for the kernel

  // Map each atom to its grid point; per-point counts are zeroed first.
  // Overflow of _max_brick_atoms per point is reported through error_flag.
  device->zero(d_brick_counts,d_brick_counts.numel());
  k_particle_map.set_size(GX,BX);
  k_particle_map.run(&atom->x, &atom->q, &f_delvolinv, &ainum,
                     &d_brick_counts, &d_brick_atoms, &_brick_x, &_brick_y,
                     &_brick_z, &_delxinv, &_delyinv, &_delzinv, &_nlocal_x,
                     &_nlocal_y, &_nlocal_z, &_atom_stride, &_max_brick_atoms,
                     &error_flag);
  time_map.stop();

  time_rho.start();
  BX=block_size();

  // One pencil per (y,z) column; _block_pencils pencils per block.
  GX=static_cast<int>(ceil(static_cast<double>(_npts_y*_npts_z)/
                      _block_pencils));
  k_make_rho.set_size(GX,BX);
  k_make_rho.run(&d_brick_counts, &d_brick_atoms, &brick, &d_rho_coeff,
                 &_atom_stride, &_npts_x, &_npts_y, &_npts_z, &_nlocal_x,
                 &_nlocal_y, &_nlocal_z, &_order_m_1, &_order, &_order2);
  time_rho.stop();

  // Start async transfer of the density brick and error flag to the host;
  // spread() synchronizes on these before reading them.
  time_out.start();
  brick.update_host(_npts_yx*_npts_z,true);
  error_flag.update_host(true);
  time_out.stop();

  _precompute_done=true;
}
295 
296 // ---------------------------------------------------------------------------
297 // Charge spreading stuff
298 // ---------------------------------------------------------------------------
299 template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
spread(const int ago,const int nlocal,const int nall,double ** host_x,int * host_type,bool & success,double * host_q,double * boxlo,const double delxinv,const double delyinv,const double delzinv)300 int PPPMT::spread(const int ago, const int nlocal, const int nall,
301                            double **host_x, int *host_type, bool &success,
302                            double *host_q, double *boxlo,
303                            const double delxinv, const double delyinv,
304                            const double delzinv) {
305   if (_precompute_done==false) {
306     atom->acc_timers();
307     _precompute(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,delxinv,
308                 delyinv,delzinv);
309   }
310 
311   device->stop_host_timer();
312 
313   if (!success || nlocal==0)
314     return 0;
315 
316   double t=MPI_Wtime();
317   time_out.sync_stop();
318   _cpu_idle_time+=MPI_Wtime()-t;
319 
320   _precompute_done=false;
321 
322   if (error_flag[0]==2) {
323     // Not enough storage for atoms on the brick
324     _max_brick_atoms*=2;
325     error_flag.device.zero();
326     d_brick_atoms.resize(_atom_stride*_max_brick_atoms);
327     _max_bytes+=d_brick_atoms.row_bytes();
328     return spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,
329                   delxinv,delyinv,delzinv);
330   }
331 
332   return error_flag[0];
333 }
334 
335 // ---------------------------------------------------------------------------
336 // Charge spreading stuff
337 // ---------------------------------------------------------------------------
338 template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
interp(const grdtyp qqrd2e_scale)339 void PPPMT::interp(const grdtyp qqrd2e_scale) {
340   time_in.start();
341   vd_brick.update_device(true);
342   time_in.stop();
343 
344   time_interp.start();
345   // Compute the block size and grid size to keep all cores busy
346   int BX=this->block_size();
347   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
348 
349   int ainum=this->ans->inum();
350 
351   k_interp.set_size(GX,BX);
352   k_interp.run(&atom->x, &atom->q, &ainum, &vd_brick, &d_rho_coeff,
353                &_npts_x, &_npts_yx, &_brick_x, &_brick_y, &_brick_z, &_delxinv,
354                &_delyinv, &_delzinv, &_order, &_order2, &qqrd2e_scale,
355                &ans->force);
356   time_interp.stop();
357 
358   ans->copy_answers(false,false,false,false);
359   if (_kspace_split==false)
360     device->add_ans_object(ans);
361 }
362 
363 template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
host_memory_usage() const364 double PPPMT::host_memory_usage() const {
365   return device->atom.host_memory_usage()+
366          sizeof(PPPM<numtyp,acctyp,grdtyp,grdtyp4>);
367 }
368 
369 template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
compile_kernels(UCL_Device & dev)370 void PPPMT::compile_kernels(UCL_Device &dev) {
371   if (_compiled)
372     return;
373 
374   if (sizeof(grdtyp)==sizeof(double) && ucl_device->double_precision()==false)
375     return;
376 
377   std::string flags=device->compile_string();
378   #ifdef USE_OPENCL
379   flags+=std::string(" -Dgrdtyp=")+ucl_template_name<grdtyp>()+" -Dgrdtyp4="+
380          ucl_template_name<grdtyp>()+"4";
381   #endif
382 
383   pppm_program=new UCL_Program(dev);
384 
385   #ifdef USE_OPENCL
386   pppm_program->load_string(pppm,flags.c_str());
387   #else
388   if (sizeof(grdtyp)==sizeof(float))
389     pppm_program->load_string(pppm_f,flags.c_str());
390   else
391     pppm_program->load_string(pppm_d,flags.c_str());
392   #endif
393 
394   k_particle_map.set_function(*pppm_program,"particle_map");
395   k_make_rho.set_function(*pppm_program,"make_rho");
396   k_interp.set_function(*pppm_program,"interp");
397   pos_tex.get_texture(*pppm_program,"pos_tex");
398   q_tex.get_texture(*pppm_program,"q_tex");
399 
400   _compiled=true;
401 }
402 
// Explicit instantiations for single- and double-precision grid types,
// using the library-wide numeric/accumulator precision settings.
template class PPPM<PRECISION,ACC_PRECISION,float,_lgpu_float4>;
template class PPPM<PRECISION,ACC_PRECISION,double,_lgpu_double4>;
405