1 /***************************************************************************
2                                    ufm.cpp
3                              -------------------
4                             Rodolfo Paula Leite (Unicamp/Brazil)
5                             Maurice de Koning (Unicamp/Brazil)
6 
7   Class for acceleration of the ufm pair style.
8 
9  __________________________________________________________________________
10     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
11  __________________________________________________________________________
12 
13     begin                :
14     email                : pl.rodolfo@gmail.com
15                            dekoning@ifi.unicamp.br
16  ***************************************************************************/
17 
18 #if defined(USE_OPENCL)
19 #include "ufm_cl.h"
20 #elif defined(USE_CUDART)
21 const char *ufm=0;
22 #else
23 #include "ufm_cubin.h"
24 #endif
25 
26 #include "lal_ufm.h"
27 #include <cassert>
28 namespace LAMMPS_AL {
// Shorthand for the templated class name, used to qualify the out-of-class
// member definitions below.
#define UFMT UFM<numtyp, acctyp>

// Library-wide device object, defined in another translation unit.
// NOTE(review): the member functions visible in this file reach the device
// through this->device / this->ucl_device rather than this global - confirm
// whether the declaration is still needed here.
extern Device<PRECISION,ACC_PRECISION> device;
32 
33 template <class numtyp, class acctyp>
UFM()34 UFMT::UFM() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
35 }
36 
37 template <class numtyp, class acctyp>
~UFM()38 UFMT::~UFM() {
39   clear();
40 }
41 
42 template <class numtyp, class acctyp>
bytes_per_atom(const int max_nbors) const43 int UFMT::bytes_per_atom(const int max_nbors) const {
44   return this->bytes_per_atom_atomic(max_nbors);
45 }
46 
47 template <class numtyp, class acctyp>
init(const int ntypes,double ** host_cutsq,double ** host_uf1,double ** host_uf2,double ** host_uf3,double ** host_offset,double * host_special_lj,const int nlocal,const int nall,const int max_nbors,const int maxspecial,const double cell_size,const double gpu_split,FILE * _screen)48 int UFMT::init(const int ntypes,
49                           double **host_cutsq, double **host_uf1,
50                           double **host_uf2, double **host_uf3,
51                           double **host_offset,
52                           double *host_special_lj, const int nlocal,
53                           const int nall, const int max_nbors,
54                           const int maxspecial, const double cell_size,
55                           const double gpu_split, FILE *_screen) {
56   int success;
57   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
58                             _screen,ufm,"k_ufm");
59   if (success!=0)
60     return success;
61 
62   // If atom type constants fit in shared memory use fast kernel
63   int lj_types=ntypes;
64   shared_types=false;
65   int max_shared_types=this->device->max_shared_types();
66   if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
67     lj_types=max_shared_types;
68     shared_types=true;
69   }
70   _lj_types=lj_types;
71 
72   // Allocate a host write buffer for data initialization
73   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
74                                UCL_WRITE_ONLY);
75 
76   for (int i=0; i<lj_types*lj_types; i++)
77     host_write[i]=0.0;
78 
79   uf1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
80   this->atom->type_pack4(ntypes,lj_types,uf1,host_write,host_uf1,host_uf2,
81                          host_cutsq);
82 
83   uf3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
84   this->atom->type_pack4(ntypes,lj_types,uf3,host_write,host_uf3,host_uf2,
85                          host_offset);
86 
87   UCL_H_Vec<double> dview;
88   sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
89   dview.view(host_special_lj,4,*(this->ucl_device));
90   ucl_copy(sp_lj,dview,false);
91 
92   _allocated=true;
93   this->_max_bytes=uf1.row_bytes()+uf3.row_bytes()+sp_lj.row_bytes();
94   return 0;
95 }
96 
97 template <class numtyp, class acctyp>
reinit(const int ntypes,double ** host_cutsq,double ** host_uf1,double ** host_uf2,double ** host_uf3,double ** host_offset)98 void UFMT::reinit(const int ntypes, double **host_cutsq, double **host_uf1,
99                  double **host_uf2, double **host_uf3, double **host_offset) {
100   // Allocate a host write buffer for data initialization
101   UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
102                                UCL_WRITE_ONLY);
103 
104   for (int i=0; i<_lj_types*_lj_types; i++)
105     host_write[i]=0.0;
106 
107   this->atom->type_pack4(ntypes,_lj_types,uf1,host_write,host_uf1,host_uf2,
108                          host_cutsq);
109   this->atom->type_pack4(ntypes,_lj_types,uf3,host_write,host_uf3,host_uf2,
110                          host_offset);
111 }
112 
113 template <class numtyp, class acctyp>
clear()114 void UFMT::clear() {
115   if (!_allocated)
116     return;
117   _allocated=false;
118 
119   uf1.clear();
120   uf3.clear();
121   sp_lj.clear();
122   this->clear_atomic();
123 }
124 
125 template <class numtyp, class acctyp>
host_memory_usage() const126 double UFMT::host_memory_usage() const {
127   return this->host_memory_usage_atomic()+sizeof(UFM<numtyp,acctyp>);
128 }
129 
130 // ---------------------------------------------------------------------------
// Calculate forces and, as requested by eflag/vflag, energy and virial terms
132 // ---------------------------------------------------------------------------
133 template <class numtyp, class acctyp>
loop(const int eflag,const int vflag)134 int UFMT::loop(const int eflag, const int vflag) {
135   // Compute the block size and grid size to keep all cores busy
136   const int BX=this->block_size();
137   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
138                                (BX/this->_threads_per_atom)));
139 
140   int ainum=this->ans->inum();
141   int nbor_pitch=this->nbor->nbor_pitch();
142   this->time_pair.start();
143   if (shared_types) {
144     this->k_pair_sel->set_size(GX,BX);
145     this->k_pair_sel->run(&this->atom->x, &uf1, &uf3, &sp_lj,
146                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
147                           &this->ans->force, &this->ans->engv, &eflag,
148                           &vflag, &ainum, &nbor_pitch,
149                           &this->_threads_per_atom);
150   } else {
151     this->k_pair.set_size(GX,BX);
152     this->k_pair.run(&this->atom->x, &uf1, &uf3, &_lj_types, &sp_lj,
153                      &this->nbor->dev_nbor, &this->_nbor_data->begin(),
154                      &this->ans->force, &this->ans->engv, &eflag, &vflag,
155                      &ainum, &nbor_pitch, &this->_threads_per_atom);
156   }
157   this->time_pair.stop();
158   return GX;
159 }
160 
// Explicit instantiation: the member templates above are defined in this
// translation unit, so the class is instantiated here for the numeric types
// selected by the PRECISION and ACC_PRECISION macros.
template class UFM<PRECISION,ACC_PRECISION>;
162 }
163