/***************************************************************************
                                  pppm.cpp
                             -------------------
                           W. Michael Brown (ORNL)

  Class for PPPM acceleration

 __________________________________________________________________________
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 __________________________________________________________________________

    begin                :
    email                : brownw@ornl.gov
 ***************************************************************************/
15
#if defined(USE_OPENCL)
#include "pppm_cl.h"
#elif defined(USE_CUDART)
const char *pppm_f=0;
const char *pppm_d=0;
#else
#include "pppm_f_cubin.h"
#include "pppm_d_cubin.h"
#endif
#include "lal_pppm.h"
#include <cassert>
#include <cmath>

using namespace LAMMPS_AL;
#define PPPMT PPPM<numtyp, acctyp, grdtyp, grdtyp4>

extern Device<PRECISION,ACC_PRECISION> global_device;
32
33 template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
PPPM()34 PPPMT::PPPM() : _allocated(false), _compiled(false),
35 _max_bytes(0) {
36 device=&global_device;
37 ans=new Answer<numtyp,acctyp>();
38 }
39
40 template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
~PPPM()41 PPPMT::~PPPM() {
42 clear(0.0);
43 delete ans;
44 }
45
46 template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
bytes_per_atom() const47 int PPPMT::bytes_per_atom() const {
48 return device->atom.bytes_per_atom()+ans->bytes_per_atom()+1;
49 }
50
51 template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
52 grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
53 const int order, const int nxlo_out,
54 const int nylo_out, const int nzlo_out,
55 const int nxhi_out, const int nyhi_out,
56 const int nzhi_out, grdtyp **rho_coeff,
57 grdtyp **vd_brick_p, const double slab_volfactor,
58 const int nx_pppm, const int ny_pppm,
59 const int nz_pppm, const bool split, int &flag) {
60 _max_bytes=10;
61 screen=_screen;
62 _kspace_split=split;
63 bool success=true;
64
65 flag=device->init(*ans,nlocal,nall);
66 if (flag!=0)
67 return 0;
68 if (sizeof(grdtyp)==sizeof(double) && device->double_precision()==false) {
69 flag=-5;
70 return 0;
71 }
72 if (device->ptx_arch()>0.0 && device->ptx_arch()<1.1) {
73 flag=-4;
74 return 0;
75 }
76
77 ucl_device=device->gpu;
78 atom=&device->atom;
79
80 _block_size=device->pppm_block();
81 _pencil_size=device->num_mem_threads();
82 _block_pencils=_block_size/_pencil_size;
83
84 compile_kernels(*ucl_device);
85
86 // Initialize timers for the selected GPU
87 time_in.init(*ucl_device);
88 time_in.zero();
89 time_out.init(*ucl_device);
90 time_out.zero();
91 time_map.init(*ucl_device);
92 time_map.zero();
93 time_rho.init(*ucl_device);
94 time_rho.zero();
95 time_interp.init(*ucl_device);
96 time_interp.zero();
97
98 pos_tex.bind_float(atom->x,4);
99 q_tex.bind_float(atom->q,1);
100
101 _allocated=true;
102 _max_bytes=0;
103 _max_an_bytes=ans->gpu_bytes();
104
105 _order=order;
106 _order_m_1=order-1;
107 _order2=_order_m_1*_order;
108 _nlower=-(_order-1)/2;
109 _nupper=order/2;
110 _nxlo_out=nxlo_out;
111 _nylo_out=nylo_out;
112 _nzlo_out=nzlo_out;
113 _nxhi_out=nxhi_out;
114 _nyhi_out=nyhi_out;
115 _nzhi_out=nzhi_out;
116
117 _slab_volfactor=slab_volfactor;
118 _nx_pppm=nx_pppm;
119 _ny_pppm=ny_pppm;
120 _nz_pppm=nz_pppm;
121
122 _max_brick_atoms=10;
123
124 // Get rho_coeff on device
125 int n2lo=(1-order)/2;
126 int numel=order*( order/2 - n2lo + 1 );
127 success=success && (d_rho_coeff.alloc(numel,*ucl_device,UCL_READ_ONLY)==
128 UCL_SUCCESS);
129 UCL_H_Vec<grdtyp> view;
130 view.view(rho_coeff[0]+n2lo,numel,*ucl_device);
131 ucl_copy(d_rho_coeff,view,true);
132 _max_bytes+=d_rho_coeff.row_bytes();
133
134 // Allocate storage for grid
135 _npts_x=nxhi_out-nxlo_out+1;
136 _npts_y=nyhi_out-nylo_out+1;
137 _npts_z=nzhi_out-nzlo_out+1;
138 _npts_yx=_npts_x*_npts_y;
139 success=success && (brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device,
140 UCL_READ_ONLY,UCL_WRITE_ONLY)==UCL_SUCCESS);
141 success=success && (vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device,
142 UCL_READ_WRITE,UCL_READ_ONLY)==UCL_SUCCESS);
143 *vd_brick_p=vd_brick.host.begin();
144 _max_bytes+=brick.device.row_bytes()+vd_brick.device.row_bytes();
145
146 // Allocate vector with count of atoms assigned to each grid point
147 _nlocal_x=_npts_x+_nlower-_nupper;
148 _nlocal_y=_npts_y+_nlower-_nupper;
149 _nlocal_z=_npts_z+_nlower-_nupper;
150 _nlocal_yx=_nlocal_x*_nlocal_y;
151 _atom_stride=_nlocal_x*_nlocal_y*_nlocal_z;
152 success=success && (d_brick_counts.alloc(_atom_stride,*ucl_device)==
153 UCL_SUCCESS);
154 _max_bytes+=d_brick_counts.row_bytes();
155
156 // Allocate storage for atoms assigned to each grid point
157 success=success && (d_brick_atoms.alloc(_atom_stride*_max_brick_atoms,
158 *ucl_device)==UCL_SUCCESS);
159 _max_bytes+=d_brick_atoms.row_bytes();
160
161 // Allocate error flags for checking out of bounds atoms
162 success=success && (error_flag.alloc(1,*ucl_device,UCL_READ_ONLY,
163 UCL_WRITE_ONLY)==UCL_SUCCESS);
164 if (!success) {
165 flag=-3;
166 return 0;
167 }
168
169 error_flag.device.zero();
170 _max_bytes+=1;
171
172 _cpu_idle_time=0.0;
173
174 return brick.host.begin();
175 }
176
177 template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
clear(const double cpu_time)178 void PPPMT::clear(const double cpu_time) {
179 if (!_allocated)
180 return;
181 _allocated=false;
182 _precompute_done=false;
183
184 brick.clear();
185 vd_brick.clear();
186 d_brick_counts.clear();
187 error_flag.clear();
188 d_brick_atoms.clear();
189
190 acc_timers();
191 device->output_kspace_times(time_in,time_out,time_map,time_rho,time_interp,
192 *ans,_max_bytes+_max_an_bytes,cpu_time,
193 _cpu_idle_time,screen);
194
195 if (_compiled) {
196 k_particle_map.clear();
197 k_make_rho.clear();
198 k_interp.clear();
199 delete pppm_program;
200 _compiled=false;
201 }
202
203 time_in.clear();
204 time_out.clear();
205 time_map.clear();
206 time_rho.clear();
207 time_interp.clear();
208
209 ans->clear();
210 device->clear();
211 }
212
213 // ---------------------------------------------------------------------------
214 // Charge assignment that can be performed asynchronously
215 // ---------------------------------------------------------------------------
216 template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
_precompute(const int ago,const int nlocal,const int nall,double ** host_x,int * host_type,bool & success,double * host_q,double * boxlo,const double delxinv,const double delyinv,const double delzinv)217 void PPPMT::_precompute(const int ago, const int nlocal, const int nall,
218 double **host_x, int *host_type, bool &success,
219 double *host_q, double *boxlo,
220 const double delxinv, const double delyinv,
221 const double delzinv) {
222 acc_timers();
223 if (nlocal==0) {
224 zero_timers();
225 return;
226 }
227
228 ans->inum(nlocal);
229
230 if (ago==0) {
231 resize_atom(nlocal,nall,success);
232 resize_local(nlocal,success);
233 if (!success)
234 return;
235
236 double bytes=ans->gpu_bytes();
237 if (bytes>_max_an_bytes)
238 _max_an_bytes=bytes;
239 }
240
241 atom->cast_x_data(host_x,host_type);
242 atom->cast_q_data(host_q);
243 atom->add_x_data(host_x,host_type);
244 atom->add_q_data();
245
246 time_map.start();
247
248 // Compute the block size and grid size to keep all cores busy
249 int BX=this->block_size();
250 int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
251
252 int ainum=this->ans->inum();
253
254 // Boxlo adjusted to be upper left brick and shift for even spline order
255 double shift=0.0;
256 if (_order % 2)
257 shift=0.5;
258 _brick_x=boxlo[0]+(_nxlo_out-_nlower-shift)/delxinv;
259 _brick_y=boxlo[1]+(_nylo_out-_nlower-shift)/delyinv;
260 _brick_z=boxlo[2]+(_nzlo_out-_nlower-shift)/delzinv;
261
262 _delxinv=delxinv;
263 _delyinv=delyinv;
264 _delzinv=delzinv;
265 double delvolinv = delxinv*delyinv*delzinv;
266 grdtyp f_delvolinv = delvolinv;
267
268 device->zero(d_brick_counts,d_brick_counts.numel());
269 k_particle_map.set_size(GX,BX);
270 k_particle_map.run(&atom->x, &atom->q, &f_delvolinv, &ainum,
271 &d_brick_counts, &d_brick_atoms, &_brick_x, &_brick_y,
272 &_brick_z, &_delxinv, &_delyinv, &_delzinv, &_nlocal_x,
273 &_nlocal_y, &_nlocal_z, &_atom_stride, &_max_brick_atoms,
274 &error_flag);
275 time_map.stop();
276
277 time_rho.start();
278 BX=block_size();
279
280 GX=static_cast<int>(ceil(static_cast<double>(_npts_y*_npts_z)/
281 _block_pencils));
282 k_make_rho.set_size(GX,BX);
283 k_make_rho.run(&d_brick_counts, &d_brick_atoms, &brick, &d_rho_coeff,
284 &_atom_stride, &_npts_x, &_npts_y, &_npts_z, &_nlocal_x,
285 &_nlocal_y, &_nlocal_z, &_order_m_1, &_order, &_order2);
286 time_rho.stop();
287
288 time_out.start();
289 brick.update_host(_npts_yx*_npts_z,true);
290 error_flag.update_host(true);
291 time_out.stop();
292
293 _precompute_done=true;
294 }
295
296 // ---------------------------------------------------------------------------
297 // Charge spreading stuff
298 // ---------------------------------------------------------------------------
299 template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
spread(const int ago,const int nlocal,const int nall,double ** host_x,int * host_type,bool & success,double * host_q,double * boxlo,const double delxinv,const double delyinv,const double delzinv)300 int PPPMT::spread(const int ago, const int nlocal, const int nall,
301 double **host_x, int *host_type, bool &success,
302 double *host_q, double *boxlo,
303 const double delxinv, const double delyinv,
304 const double delzinv) {
305 if (_precompute_done==false) {
306 atom->acc_timers();
307 _precompute(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,delxinv,
308 delyinv,delzinv);
309 }
310
311 device->stop_host_timer();
312
313 if (!success || nlocal==0)
314 return 0;
315
316 double t=MPI_Wtime();
317 time_out.sync_stop();
318 _cpu_idle_time+=MPI_Wtime()-t;
319
320 _precompute_done=false;
321
322 if (error_flag[0]==2) {
323 // Not enough storage for atoms on the brick
324 _max_brick_atoms*=2;
325 error_flag.device.zero();
326 d_brick_atoms.resize(_atom_stride*_max_brick_atoms);
327 _max_bytes+=d_brick_atoms.row_bytes();
328 return spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,
329 delxinv,delyinv,delzinv);
330 }
331
332 return error_flag[0];
333 }
334
335 // ---------------------------------------------------------------------------
336 // Charge spreading stuff
337 // ---------------------------------------------------------------------------
338 template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
interp(const grdtyp qqrd2e_scale)339 void PPPMT::interp(const grdtyp qqrd2e_scale) {
340 time_in.start();
341 vd_brick.update_device(true);
342 time_in.stop();
343
344 time_interp.start();
345 // Compute the block size and grid size to keep all cores busy
346 int BX=this->block_size();
347 int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
348
349 int ainum=this->ans->inum();
350
351 k_interp.set_size(GX,BX);
352 k_interp.run(&atom->x, &atom->q, &ainum, &vd_brick, &d_rho_coeff,
353 &_npts_x, &_npts_yx, &_brick_x, &_brick_y, &_brick_z, &_delxinv,
354 &_delyinv, &_delzinv, &_order, &_order2, &qqrd2e_scale,
355 &ans->force);
356 time_interp.stop();
357
358 ans->copy_answers(false,false,false,false);
359 if (_kspace_split==false)
360 device->add_ans_object(ans);
361 }
362
363 template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
host_memory_usage() const364 double PPPMT::host_memory_usage() const {
365 return device->atom.host_memory_usage()+
366 sizeof(PPPM<numtyp,acctyp,grdtyp,grdtyp4>);
367 }
368
369 template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
compile_kernels(UCL_Device & dev)370 void PPPMT::compile_kernels(UCL_Device &dev) {
371 if (_compiled)
372 return;
373
374 if (sizeof(grdtyp)==sizeof(double) && ucl_device->double_precision()==false)
375 return;
376
377 std::string flags=device->compile_string();
378 #ifdef USE_OPENCL
379 flags+=std::string(" -Dgrdtyp=")+ucl_template_name<grdtyp>()+" -Dgrdtyp4="+
380 ucl_template_name<grdtyp>()+"4";
381 #endif
382
383 pppm_program=new UCL_Program(dev);
384
385 #ifdef USE_OPENCL
386 pppm_program->load_string(pppm,flags.c_str());
387 #else
388 if (sizeof(grdtyp)==sizeof(float))
389 pppm_program->load_string(pppm_f,flags.c_str());
390 else
391 pppm_program->load_string(pppm_d,flags.c_str());
392 #endif
393
394 k_particle_map.set_function(*pppm_program,"particle_map");
395 k_make_rho.set_function(*pppm_program,"make_rho");
396 k_interp.set_function(*pppm_program,"interp");
397 pos_tex.get_texture(*pppm_program,"pos_tex");
398 q_tex.get_texture(*pppm_program,"q_tex");
399
400 _compiled=true;
401 }
402
403 template class PPPM<PRECISION,ACC_PRECISION,float,_lgpu_float4>;
404 template class PPPM<PRECISION,ACC_PRECISION,double,_lgpu_double4>;
405