1 // clang-format off
2 /* -*- c++ -*- -------------------------------------------------------------
3    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
4    https://www.lammps.org/, Sandia National Laboratories
5    Steve Plimpton, sjplimp@sandia.gov
6 
7    Copyright (2003) Sandia Corporation.  Under the terms of Contract
8    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
9    certain rights in this software.  This software is distributed under
10    the GNU General Public License.
11 
12    See the README file in the top-level LAMMPS directory.
13 ------------------------------------------------------------------------- */
14 
15 /* ----------------------------------------------------------------------
16    Contributing authors: Christian Trott (SNL), Stan Moore (SNL)
17 ------------------------------------------------------------------------- */
18 
19 #ifndef LMP_SNA_KOKKOS_H
20 #define LMP_SNA_KOKKOS_H
21 
22 #include <complex>
23 #include <ctime>
24 #include <Kokkos_Core.hpp>
25 #include "kokkos_type.h"
26 
27 #ifdef __SYCL_DEVICE_ONLY__
28 #include <CL/sycl.hpp>
29 #endif
30 
31 namespace LAMMPS_NS {
32 
33 template<typename real_type_, int vector_length_>
34 struct WignerWrapper {
35   using real_type = real_type_;
36   using complex = SNAComplex<real_type>;
37   static constexpr int vector_length = vector_length_;
38 
39   const int offset; // my offset into the vector (0, ..., vector_length - 1)
40   real_type* buffer; // buffer of real numbers
41 
42   KOKKOS_INLINE_FUNCTION
WignerWrapperWignerWrapper43   WignerWrapper(complex* buffer_, const int offset_)
44    : offset(offset_), buffer(reinterpret_cast<real_type*>(buffer_))
45   { ; }
46 
47   KOKKOS_INLINE_FUNCTION
getWignerWrapper48   complex get(const int& ma) const {
49     return complex(buffer[offset + 2 * vector_length * ma], buffer[offset + vector_length + 2 * vector_length * ma]);
50   }
51 
52   KOKKOS_INLINE_FUNCTION
setWignerWrapper53   void set(const int& ma, const complex& store) const {
54     buffer[offset + 2 * vector_length * ma] = store.re;
55     buffer[offset + vector_length + 2 * vector_length * ma] = store.im;
56   }
57 };
58 
/* ----------------------------------------------------------------------
   Pair of ints aligned to 8 bytes; element type of the idxu_full_half
   view declared in SNAKokkos.
------------------------------------------------------------------------- */
struct alignas(8) FullHalfMapper {
  // index value (presumably into the half-size idxu layout -- confirm in the impl)
  int idxu_half;
  // sign/conjugation flag:  0 -> isn't flipped,  1 -> conj,  -1 -> -conj
  int flip_sign;
};
63 
64 template<class DeviceType, typename real_type_, int vector_length_>
65 class SNAKokkos {
66 
67 public:
68   using real_type = real_type_;
69   using complex = SNAComplex<real_type>;
70   static constexpr int vector_length = vector_length_;
71 
72   typedef Kokkos::View<int*, DeviceType> t_sna_1i;
73   typedef Kokkos::View<real_type*, DeviceType> t_sna_1d;
74   typedef Kokkos::View<real_type*, typename KKDevice<DeviceType>::value, Kokkos::MemoryTraits<Kokkos::Atomic> > t_sna_1d_atomic;
75   typedef Kokkos::View<int**, DeviceType> t_sna_2i;
76   typedef Kokkos::View<real_type**, DeviceType> t_sna_2d;
77   typedef Kokkos::View<real_type**, Kokkos::LayoutLeft, DeviceType> t_sna_2d_ll;
78   typedef Kokkos::View<real_type***, DeviceType> t_sna_3d;
79   typedef Kokkos::View<real_type***, Kokkos::LayoutLeft, DeviceType> t_sna_3d_ll;
80   typedef Kokkos::View<real_type***[3], DeviceType> t_sna_4d;
81   typedef Kokkos::View<real_type****, Kokkos::LayoutLeft, DeviceType> t_sna_4d_ll;
82   typedef Kokkos::View<real_type**[3], DeviceType> t_sna_3d3;
83   typedef Kokkos::View<real_type*****, DeviceType> t_sna_5d;
84 
85   typedef Kokkos::View<complex*, DeviceType> t_sna_1c;
86   typedef Kokkos::View<complex*, typename KKDevice<DeviceType>::value, Kokkos::MemoryTraits<Kokkos::Atomic> > t_sna_1c_atomic;
87   typedef Kokkos::View<complex**, DeviceType> t_sna_2c;
88   typedef Kokkos::View<complex**, Kokkos::LayoutLeft, DeviceType> t_sna_2c_ll;
89   typedef Kokkos::View<complex**, Kokkos::LayoutRight, DeviceType> t_sna_2c_lr;
90   typedef Kokkos::View<complex***, DeviceType> t_sna_3c;
91   typedef Kokkos::View<complex***, Kokkos::LayoutLeft, DeviceType> t_sna_3c_ll;
92   typedef Kokkos::View<complex***[3], DeviceType> t_sna_4c;
93   typedef Kokkos::View<complex***[3], Kokkos::LayoutLeft, DeviceType> t_sna_4c3_ll;
94   typedef Kokkos::View<complex****, Kokkos::LayoutLeft, DeviceType> t_sna_4c_ll;
95   typedef Kokkos::View<complex**[3], DeviceType> t_sna_3c3;
96   typedef Kokkos::View<complex*****, DeviceType> t_sna_5c;
97 
98 inline
SNAKokkos()99   SNAKokkos() {};
100   KOKKOS_INLINE_FUNCTION
101   SNAKokkos(const SNAKokkos<DeviceType,real_type,vector_length>& sna, const typename Kokkos::TeamPolicy<DeviceType>::member_type& team);
102 
103 inline
104   SNAKokkos(real_type, int, real_type, int, int, int, int, int, int);
105 
106   KOKKOS_INLINE_FUNCTION
107   ~SNAKokkos();
108 
109 inline
110   void build_indexlist(); // SNAKokkos()
111 
112 inline
113   void init();            //
114 
115   double memory_usage();
116 
117   int ncoeff;
118   int host_flag;
119 
120   // functions for bispectrum coefficients, GPU only
121   KOKKOS_INLINE_FUNCTION
122   void compute_cayley_klein(const int&, const int&, const int&);
123   KOKKOS_INLINE_FUNCTION
124   void pre_ui(const int&, const int&, const int&, const int&); // ForceSNAP
125 
126   // version of the code with parallelism over j_bend
127   KOKKOS_INLINE_FUNCTION
128   void compute_ui_small(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int, const int); // ForceSNAP
129   // version of the code without parallelism over j_bend
130   KOKKOS_INLINE_FUNCTION
131   void compute_ui_large(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int); // ForceSNAP
132 
133   KOKKOS_INLINE_FUNCTION
134   void compute_zi(const int&, const int&, const int&);    // ForceSNAP
135   KOKKOS_INLINE_FUNCTION
136   void compute_yi(int,int,int,
137    const Kokkos::View<real_type***, Kokkos::LayoutLeft, DeviceType> &beta_pack); // ForceSNAP
138   KOKKOS_INLINE_FUNCTION
139   void compute_yi_with_zlist(int,int,int,
140    const Kokkos::View<real_type***, Kokkos::LayoutLeft, DeviceType> &beta_pack); // ForceSNAP
141   KOKKOS_INLINE_FUNCTION
142   void compute_bi(const int&, const int&, const int&);    // ForceSNAP
143 
144   // functions for derivatives, GPU only
145   // version of the code with parallelism over j_bend
146   template<int dir>
147   KOKKOS_INLINE_FUNCTION
148   void compute_fused_deidrj_small(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int, const int); //ForceSNAP
149   // version of the code without parallelism over j_bend
150   template<int dir>
151   KOKKOS_INLINE_FUNCTION
152   void compute_fused_deidrj_large(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int); //ForceSNAP
153 
154   // core "evaluation" functions that get plugged into "compute" functions
155   // plugged into compute_ui_small, compute_ui_large
156   KOKKOS_FORCEINLINE_FUNCTION
157   void evaluate_ui_jbend(const WignerWrapper<real_type, vector_length>&, const complex&, const complex&, const real_type&, const int&,
158                         const int&, const int&, const int&);
159   // plugged into compute_zi, compute_yi
160   KOKKOS_FORCEINLINE_FUNCTION
161   complex evaluate_zi(const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&,
162                         const int&, const int&, const int&, const int&, const real_type*);
163   // plugged into compute_yi, compute_yi_with_zlist
164   KOKKOS_FORCEINLINE_FUNCTION
165   real_type evaluate_beta_scaled(const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&,
166                         const Kokkos::View<real_type***, Kokkos::LayoutLeft, DeviceType> &);
167   // plugged into compute_fused_deidrj_small, compute_fused_deidrj_large
168   KOKKOS_FORCEINLINE_FUNCTION
169   real_type evaluate_duidrj_jbend(const WignerWrapper<real_type, vector_length>&, const complex&, const complex&, const real_type&,
170                         const WignerWrapper<real_type, vector_length>&, const complex&, const complex&, const real_type&,
171                         const int&, const int&, const int&, const int&);
172 
173   // functions for bispectrum coefficients, CPU only
174   KOKKOS_INLINE_FUNCTION
175   void pre_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team,const int&,const int&); // ForceSNAP
176   KOKKOS_INLINE_FUNCTION
177   void compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
178   KOKKOS_INLINE_FUNCTION
179   void compute_zi_cpu(const int&);    // ForceSNAP
180   KOKKOS_INLINE_FUNCTION
181   void compute_yi_cpu(int,
182    const Kokkos::View<real_type**, DeviceType> &beta); // ForceSNAP
183     KOKKOS_INLINE_FUNCTION
184   void compute_bi_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int);    // ForceSNAP
185 
186   // functions for derivatives, CPU only
187   KOKKOS_INLINE_FUNCTION
188   void compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); //ForceSNAP
189   KOKKOS_INLINE_FUNCTION
190   void compute_deidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
191 
192   KOKKOS_INLINE_FUNCTION
193   real_type compute_sfac(real_type, real_type); // add_uarraytot, compute_duarray
194 
195   KOKKOS_INLINE_FUNCTION
196   real_type compute_dsfac(real_type, real_type); // compute_duarray
197 
198   KOKKOS_INLINE_FUNCTION
199   void compute_s_dsfac(const real_type, const real_type, real_type&, real_type&); // compute_cayley_klein
200 
201 #ifdef TIMING_INFO
202   double* timers;
203   timespec starttime, endtime;
204   int print;
205   int counter;
206 #endif
207 
208   //per sna class instance for OMP use
209 
210   // Per InFlight Particle
211   t_sna_3d rij;
212   t_sna_2i inside;
213   t_sna_2d wj;
214   t_sna_2d rcutij;
215   t_sna_2i element;
216   t_sna_3d dedr;
217   int natom, nmax;
218 
219   void grow_rij(int, int);
220 
221   int twojmax, diagonalstyle;
222 
223   t_sna_3d blist;
224   t_sna_3c_ll ulisttot;
225   t_sna_3c_ll ulisttot_full; // un-folded ulisttot, cpu only
226   t_sna_3c_ll zlist;
227 
228   t_sna_3c_ll ulist;
229   t_sna_3c_ll ylist;
230 
231   // derivatives of data
232   t_sna_4c3_ll dulist;
233 
234   // Modified structures for GPU backend
235   t_sna_3c_ll a_pack; // Cayley-Klein `a`
236   t_sna_3c_ll b_pack; // `b`
237   t_sna_4c_ll da_pack; // `da`
238   t_sna_4c_ll db_pack; // `db`
239   t_sna_4d_ll sfac_pack; // sfac, dsfac_{x,y,z}
240 
241   t_sna_4d_ll ulisttot_re_pack; // split real,
242   t_sna_4d_ll ulisttot_im_pack; // imag, AoSoA, flattened
243   t_sna_4c_ll ulisttot_pack; // AoSoA layout
244   t_sna_4c_ll zlist_pack; // AoSoA layout
245   t_sna_4d_ll blist_pack;
246   t_sna_4d_ll ylist_pack_re; // split real,
247   t_sna_4d_ll ylist_pack_im; // imag AoSoA layout
248 
249   int idxcg_max, idxu_max, idxu_half_max, idxu_cache_max, idxz_max, idxb_max;
250 
251   // Chem snap counts
252   int nelements;
253   int ndoubles;
254   int ntriples;
255 
256 private:
257   real_type rmin0, rfac0;
258 
259   //use indexlist instead of loops, constructor generates these
260   // Same across all SNAKokkos
261   Kokkos::View<int*[10], DeviceType> idxz;
262   Kokkos::View<int*[3], DeviceType> idxb;
263   Kokkos::View<int***, DeviceType> idxcg_block;
264 
265 public:
266   Kokkos::View<int*, DeviceType> idxu_block;
267   Kokkos::View<int*, DeviceType> idxu_half_block;
268   Kokkos::View<int*, DeviceType> idxu_cache_block;
269   Kokkos::View<FullHalfMapper*, DeviceType> idxu_full_half;
270 
271 private:
272   Kokkos::View<int***, DeviceType> idxz_block;
273   Kokkos::View<int***, DeviceType> idxb_block;
274 
275   // data for bispectrum coefficients
276 
277   // Same across all SNAKokkos
278   t_sna_1d cglist;
279   t_sna_2d rootpqarray;
280 
281   static const int nmaxfactorial = 167;
282   static const double nfac_table[];
283   inline
284   double factorial(int);
285 
286   KOKKOS_INLINE_FUNCTION
287   void create_team_scratch_arrays(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team); // SNAKokkos()
288   KOKKOS_INLINE_FUNCTION
289   void create_thread_scratch_arrays(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team); // SNAKokkos()
290 
291 inline
292   void init_clebsch_gordan(); // init()
293 
294 inline
295   void init_rootpqarray();    // init()
296 
297   KOKKOS_INLINE_FUNCTION
298   void add_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int, const real_type&, const real_type&, const real_type&, int); // compute_ui
299 
300   KOKKOS_INLINE_FUNCTION
301   void compute_uarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
302                       const real_type&, const real_type&, const real_type&,
303                       const real_type&, const real_type&); // compute_ui_cpu
304 
305 
306   inline
307   double deltacg(int, int, int);  // init_clebsch_gordan
308 
309 inline
310   int compute_ncoeff();           // SNAKokkos()
311   KOKKOS_INLINE_FUNCTION
312   void compute_duarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
313                        const real_type&, const real_type&, const real_type&, // compute_duidrj_cpu
314                        const real_type&, const real_type&, const real_type&, const real_type&, const real_type&);
315 
316   // Sets the style for the switching function
317   // 0 = none
318   // 1 = cosine
319   int switch_flag;
320 
321   // Chem snap flags
322   int chem_flag;
323   int bnorm_flag;
324 
325   // Self-weight
326   real_type wself;
327   int wselfall_flag;
328 
329   int bzero_flag; // 1 if bzero subtracted from barray
330   Kokkos::View<real_type*, DeviceType> bzero; // array of B values for isolated atoms
331 };
332 
333 }
334 
335 #include "sna_kokkos_impl.h"
336 #endif
337 
338 /* ERROR/WARNING messages:
339 
340 E: Invalid argument to factorial %d
341 
N must be >= 0 and <= 167: the factorial is not defined for negative
N, and for N above 167 the result is too large.
344 
345 */
346