1 /*
2  *  Copyright (C) 2004-2021 Edward F. Valeev
3  *
4  *  This file is part of Libint.
5  *
6  *  Libint is free software: you can redistribute it and/or modify
7  *  it under the terms of the GNU General Public License as published by
8  *  the Free Software Foundation, either version 3 of the License, or
9  *  (at your option) any later version.
10  *
11  *  Libint is distributed in the hope that it will be useful,
12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  *  GNU General Public License for more details.
15  *
16  *  You should have received a copy of the GNU General Public License
17  *  along with Libint.  If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 #include "libint2/util/generated/libint2_params.h"
22 #ifndef LIBINT2_REALTYPE
23 #define LIBINT2_REALTYPE double
24 #endif
25 #include "libint2/boys.h"
26 #include "libint2/util/timer.h"
27 #include <ctime>
28 #include <cstdint>
29 #include <cstdlib>
30 #include <iostream>
31 #include <initializer_list>
32 #include <iterator>
33 #include <functional>
34 #include <numeric>
35 #include <cxxabi.h>
36 
37 #ifndef SKIP_AXPY
38 #  include <mkl_cblas.h>
39 typedef MKL_INT BLAS_INT;
40 #endif
41 
42 #ifdef SKIP_ALL
43 # define SKIP_AXPY
44 # define SKIP_DOT
45 # define SKIP_GEMM
46 # define SKIP_EXP
47 # define SKIP_SQRT
48 # define SKIP_ERF
49 # define SKIP_CHEBYSHEV
50 # define SKIP_TAYLOR
51 # define SKIP_STGNG
52 # define SKIP_YUKAWA
53 #endif
54 
55 #define AVOID_AUTO_VECTORIZATION 1 // set to 0 to measure performance within vectorizable loops
56                                    // the default is to measure performance of a single call
57 
58 using namespace std;
59 
60 libint2::Timers<1> timer;
61 
62 enum OperType {
63   f12, f12_2, f12_o_r12, f12_t_f12
64 };
to_string(OperType o)65 std::string to_string(OperType o) {
66   std::string result;
67   switch (o) {
68     case f12: result = "f12"; break;
69     case f12_2: result = "f12_2"; break;
70     case f12_o_r12: result = "f12_o_r12"; break;
71     case f12_t_f12: result = "f12_t_f12"; break;
72   }
73   return result;
74 }
75 
76 /* These state variables must be initialized so that they are not all zero. */
77 namespace dice {
78   uint32_t x, y, z, w;
init(unsigned int seed=159265359)79   void init(unsigned int seed = 159265359) {
80     srand(seed);
81     x=rand();
82     y=rand();
83     z=rand();
84     w=rand();
85   }
86   // uses xorshift128 algorithm, see http://en.wikipedia.org/wiki/Xorshift
random_unit32(void)87   uint32_t random_unit32(void) {
88     uint32_t t = x ^ (x << 11);
89     x = y; y = z; z = w;
90     return w = w ^ (w >> 19) ^ t ^ (t >> 8);
91   }
92 };
93 
94 /** profiles Kernel by running it repeatedly, and reporting the sum of values
95  *
96  * @param k the Kernel, needs to have 2 member functions: label and and eval, neither of which takes a parameter
97  * @param nrepeats the number of times the kernel is run
98  */
99 template <typename Kernel> void profile(const Kernel& k, int nrepeats);
100 
101 template <unsigned InterpolationOrder = 7>
102 void do_chebyshev(int mmax, int nrepeats);
103 template <unsigned InterpolationOrder = 7>
104 void do_taylor(int mmax, int nrepeats);
105 template <OperType O> void do_stg6g(int mmax, double T, double rho, int nrepeats);
106 template <bool exp = false>
107 void do_stg(int mmax, double T, double U, int nrepeats);
108 
109 template <typename Real, Real (Function)(Real) >
110 struct BasicKernel {
111     typedef Real ResultType;
BasicKernelBasicKernel112     BasicKernel(Real T, std::string label, double T_dec = 0.00001) : label_(label),
113         T_(T), T_dec_(T_dec), sum_(0) {}
labelBasicKernel114     std::string label() const { return label_; }
evalBasicKernel115     void eval() const {
116       sum_ += Function(T_);
117       T_ += T_dec_;
118     }
sumBasicKernel119     Real sum() const {
120       return sum_;
121     }
ops_per_evalBasicKernel122     size_t ops_per_eval() const {
123       return 1;
124     }
125     std::string label_;
126     mutable Real T_;
127     Real T_dec_;
128     mutable Real sum_;
129 };
130 
131 template <typename Real>
132 struct VectorOpKernel {
VectorOpKernelVectorOpKernel133     VectorOpKernel(size_t veclen,
134                    size_t nargs,
135                    Real T,
136                    std::string label) : result_(veclen, Real(0)), args_(nargs), label_(label) {
137       for(size_t i=0; i<args_.size(); ++i) {
138         args_[i] = new Real[veclen];
139         std::fill(args_[i], args_[i] + veclen, T);
140       }
141     }
VectorOpKernelVectorOpKernel142     VectorOpKernel(std::initializer_list<size_t> sizes, Real T,
143                    std::string label)
144         : result_(*sizes.begin(), Real(0)), args_(sizes.size() - 1), label_(label) {
145       auto size_iter = sizes.begin() + 1;
146       for (size_t i = 0; i < args_.size(); ++i, ++size_iter) {
147         args_[i] = new Real[*size_iter];
148         std::fill(args_[i], args_[i] + *size_iter, T);
149       }
150     }
~VectorOpKernelVectorOpKernel151     ~VectorOpKernel() {
152       for(size_t i=0; i<args_.size(); ++i) {
153         delete[] args_[i];
154       }
155     }
labelVectorOpKernel156     std::string label() const {
157       return label_;
158     }
159 
160     std::vector<Real> result_;
161     std::vector<Real*> args_;
162     std::string label_;
163 };
164 
165 template <typename Real>
166 struct AXPYKernel : public VectorOpKernel<Real> {
167   typedef Real ResultType;
AXPYKernelAXPYKernel168   AXPYKernel(size_t veclen,
169              Real a,
170              Real T,
171              std::string label) : VectorOpKernel<Real>(veclen, 1, T, label), a_(a) {
172     y_ = &VectorOpKernel<Real>::result_[0];
173     x_ = VectorOpKernel<Real>::args_[0];
174     n_ = VectorOpKernel<Real>::result_.size();
175   }
evalAXPYKernel176   void eval() const {
177 #pragma ivdep
178     for(size_t v=0; v<n_; ++v)
179       y_[v] += a_ * x_[v];
180   }
sumAXPYKernel181   Real sum() const {
182     return std::accumulate(y_, y_+n_, Real(0));
183   }
ops_per_evalAXPYKernel184   size_t ops_per_eval() const {
185     return n_ * 2;
186   }
187 
188   Real a_;
189   Real* y_;
190   Real const* x_;
191   BLAS_INT n_;
192 
193 };
194 
195 struct DAXPYKernel : public VectorOpKernel<double> {
196   typedef double ResultType;
197   typedef double Real;
DAXPYKernelDAXPYKernel198   DAXPYKernel(size_t veclen,
199               Real a,
200               Real T,
201               std::string label) : VectorOpKernel<Real>(veclen, 1, T, label), a_(a) {
202     y_ = &VectorOpKernel<Real>::result_[0];
203     x_ = VectorOpKernel<Real>::args_[0];
204     n_ = VectorOpKernel<Real>::result_.size();
205   }
evalDAXPYKernel206   void eval() const {
207     cblas_daxpy(n_, a_, x_, 1, y_, 1);
208   }
sumDAXPYKernel209   Real sum() const {
210     return std::accumulate(y_, y_+n_, Real(0));
211   }
ops_per_evalDAXPYKernel212   size_t ops_per_eval() const {
213     return n_ * 2;
214   }
215 
216   Real a_;
217   Real* y_;
218   Real* x_;
219   size_t n_;
220 
221 };
222 
223 template <typename Real>
224 struct DOTKernel : public VectorOpKernel<Real> {
225   typedef Real ResultType;
DOTKernelDOTKernel226   DOTKernel(size_t veclen,
227             Real T,
228             std::string label) : VectorOpKernel<Real>(veclen, 2, T, label), result_(0) {
229     x1_ = VectorOpKernel<Real>::args_[0];
230     x2_ = VectorOpKernel<Real>::args_[1];
231     n_ = VectorOpKernel<Real>::result_.size();
232   }
evalDOTKernel233   void eval() const {
234 #pragma ivdep
235     for(size_t v=0; v<n_; ++v)
236       result_ += x1_[v] * x2_[v];
237   }
sumDOTKernel238   Real sum() const {
239     return result_;
240   }
ops_per_evalDOTKernel241   size_t ops_per_eval() const {
242     return n_ * 2;
243   }
244 
245   mutable Real result_;
246   Real const* x1_;
247   Real const* x2_;
248   BLAS_INT n_;
249 
250 };
251 
252 struct DDOTKernel : public VectorOpKernel<double> {
253   typedef double ResultType;
254   typedef double Real;
DDOTKernelDDOTKernel255   DDOTKernel(size_t veclen,
256              Real T,
257              std::string label) : VectorOpKernel<Real>(veclen, 2, T, label), result_(0) {
258     x1_ = VectorOpKernel<Real>::args_[0];
259     x2_ = VectorOpKernel<Real>::args_[1];
260     n_ = VectorOpKernel<Real>::result_.size();
261   }
evalDDOTKernel262   void eval() const {
263     result_ += cblas_ddot(n_, x1_, 1, x2_, 1);
264   }
sumDDOTKernel265   Real sum() const {
266     return result_;
267   }
ops_per_evalDDOTKernel268   size_t ops_per_eval() const {
269     return n_ * 2;
270   }
271 
272   mutable Real result_;
273   const Real* x1_;
274   const Real* x2_;
275   size_t n_;
276 
277 };
278 
279 struct DGEMMKernel : public VectorOpKernel<double> {
280   typedef double ResultType;
281   typedef double Real;
DGEMMKernelDGEMMKernel282   DGEMMKernel(size_t n,
283               Real T,
284               std::string label) : VectorOpKernel<Real>(n*n, 2, T, label), m_(n), n_(n), k_(n) {
285     c_ = &VectorOpKernel<Real>::result_[0];
286     a_ = VectorOpKernel<Real>::args_[0];
287     b_ = VectorOpKernel<Real>::args_[1];
288   }
DGEMMKernelDGEMMKernel289   DGEMMKernel(size_t m, size_t n, size_t k, Real T, std::string label)
290       : VectorOpKernel<Real>({m * n, m * k, k * n}, T, label),
291         m_(m),
292         n_(n),
293         k_(k) {
294     c_ = &VectorOpKernel<Real>::result_[0];
295     a_ = VectorOpKernel<Real>::args_[0];
296     b_ = VectorOpKernel<Real>::args_[1];
297   }
evalDGEMMKernel298   void eval() const {
299     cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m_, n_, k_,
300                 1.0, a_, k_,
301                 b_, n_, 1.0, c_, n_);
302   }
sumDGEMMKernel303   Real sum() const {
304     return std::accumulate(c_, c_+m_*n_, 0.0);
305   }
306   // approximate
ops_per_evalDGEMMKernel307   size_t ops_per_eval() const {
308     return m_ * n_ * k_ * 2;
309   }
310 
311   Real* c_;  // m by n
312   const Real* a_;  // m by k
313   const Real* b_;  // k by n
314   size_t m_;
315   size_t n_;
316   size_t k_;
317 };
318 
319 const double stg_zeta = 1.0;
320 
main(int argc,char * argv[])321 int main(int argc, char* argv[]) {
322   if (argc < 2 or argc > 4) {
323     std::cout << "Description: profiles secondary compute-intensive kernels (Boys, daxpy, etc.)" << std::endl;
324     std::cout << "Usage: profile mmax T nrepeats" << std::endl;
325     return 1;
326   }
327   const int mmax  = atoi(argv[1]);
328   const double T  = atol(argv[2]);
329   const double rho = 1.0;
330   const int nrepeats  = atoi(argv[3]);
331 
332   using libint2::simd::VectorSSEDouble;
333   const VectorSSEDouble T_sse(T);
334 #if defined(__AVX__)
335   using libint2::simd::VectorAVXDouble;
336   const VectorAVXDouble T_avx(T);
337 #endif
338 
339   cout << "mmax = " << mmax << endl;
340   cout << " T   = "<< T << endl;
341 
342   // changes precision of cout to 15 for printing doubles and such.
343   cout.precision(15);
344   // set the overhead of std::high_resolution_clock; measure carefully
345   timer.set_now_overhead(25);
346 
347 #ifndef SKIP_AXPY
348   const int n = 128;
349   profile(DAXPYKernel(n, 1.0, 1.0, "daxpy"), nrepeats);
350   profile(AXPYKernel<double>(n, 1.0, 1.0, "axpy [double]"), nrepeats);
351   profile(AXPYKernel<VectorSSEDouble>(n, 1.0, 1.0, "axpy [SSE]"), nrepeats);
352 #  if defined(__AVX__)
353   profile(AXPYKernel<VectorAVXDouble>(n, 1.0, 1.0, "axpy [AVX]"), nrepeats);
354 #  endif
355 #endif
356 
357 #ifndef SKIP_DOT
358   //const int n = 4096;
359   profile(DDOTKernel(n, 1.0, "ddot"), nrepeats);
360   profile(DOTKernel<double>(n, 1.0, "dot [double]"), nrepeats);
361   profile(DOTKernel<VectorSSEDouble>(n, 1.0, "dot [SSE]"), nrepeats);
362 # if defined(__AVX__)
363   profile(DOTKernel<VectorAVXDouble>(n, 1.0, "dot [AVX]"), nrepeats);
364 # endif
365 #endif
366 
367 #ifndef SKIP_GEMM
368   const int nn = 2048;
369   profile(DGEMMKernel(nn, 1.0, "dgemm (2048,2048) x (2048,2048)"), 1);
370 #endif
371 
372 #ifndef SKIP_EXP
373   profile(BasicKernel<double,std::exp>(-T,"exp(-T) [double]", -0.00001), nrepeats);
374   profile(BasicKernel<VectorSSEDouble,libint2::simd::exp>(-T,"exp(-T) [SSE]", -0.00001), nrepeats);
375 # if defined(__AVX__)
376   profile(BasicKernel<VectorAVXDouble,libint2::simd::exp>(-T,"exp(-T) [AVX]", -0.00001), nrepeats);
377 # endif
378 #endif
379 #ifndef SKIP_SQRT
380   profile(BasicKernel<double,std::sqrt>(T,"sqrt(T) [double]"), nrepeats);
381   profile(BasicKernel<VectorSSEDouble,libint2::simd::sqrt>(T,"sqrt(T) [SSE]"), nrepeats);
382 # if defined(__AVX__)
383   profile(BasicKernel<VectorAVXDouble,libint2::simd::sqrt>(T,"sqrt(T) [AVX]"), nrepeats);
384 # endif
385 #endif
386 #ifndef SKIP_ERF
387   profile(BasicKernel<double,erf>(T,"erf(T) [double]"), nrepeats);
388   profile(BasicKernel<VectorSSEDouble,libint2::simd::erf>(T,"erf(T) [SSE]"), nrepeats);
389 # if defined(__AVX__)
390   profile(BasicKernel<VectorAVXDouble,libint2::simd::erf>(T,"erf(T) [AVX]"), nrepeats);
391 # endif
392   profile(BasicKernel<double,erfc>(T,"erfc(T) [double]"), nrepeats);
393   profile(BasicKernel<VectorSSEDouble,libint2::simd::erfc>(T,"erfc(T) [SSE]"), nrepeats);
394 # if defined(__AVX__)
395   profile(BasicKernel<VectorAVXDouble,libint2::simd::erfc>(T,"erfc(T) [AVX]"), nrepeats);
396 # endif
397 #endif
398 #ifndef SKIP_CHEBYSHEV
399   do_chebyshev<7>(mmax, nrepeats);
400 #endif
401 #ifndef SKIP_TAYLOR
402 //  do_taylor<3>(mmax, nrepeats);
403   do_taylor<7>(mmax, nrepeats);
404 #endif
405 #ifndef SKIP_STGNG
406   do_stg6g<f12>(mmax, T, rho, nrepeats);
407   do_stg6g<f12_o_r12>(mmax, T, rho, nrepeats);
408   do_stg6g<f12_2>(mmax, T, rho, nrepeats);
409   do_stg6g<f12_t_f12>(mmax, T, rho, nrepeats);
410 #endif
411 #ifndef SKIP_YUKAWA
412   do_stg<true>(mmax, T, rho, nrepeats);
413   do_stg<false>(mmax, T, rho, nrepeats);
414 #endif
415   return 0;
416 }
417 
418 template <typename Kernel>
profile(const Kernel & k,int nrepeats)419 void profile(const Kernel& k, int nrepeats) {
420   std::cout << "===================== " << k.label() << " ======================" << std::endl;
421 
422   typedef typename Kernel::ResultType Real;
423 
424   timer.clear();
425   timer.start(0);
426 
427 #if AVOID_AUTO_VECTORIZATION
428 #pragma novector
429 #endif
430   for (int i = 0; i < nrepeats; ++i) {
431     k.eval();
432   }
433   timer.stop(0);
434 
435   std::cout << "sum of " << k.label() << ": " << k.sum() << std::endl;
436 
437   cout << "Time = " << fixed << timer.read(0) << endl;
438   cout << "Rate = " << fixed << nrepeats * k.ops_per_eval() / timer.read(0) << endl;
439 }
440 
441 template <unsigned InterpolationOrder>
do_chebyshev(int mmax,int nrepeats)442 void do_chebyshev(int mmax, int nrepeats) {
443   std::cout << "===================== Fm Cheb" << InterpolationOrder << " ======================" << std::endl;
444   double* Fm_array = new double[mmax+1];
445   double* Fm_array_sum = new double[mmax+1];
446   //std::cout << "alignment of Fm = " << reinterpret_cast<unsigned long int>((char*) Fm_array) % 32ul << " bytes\n";
447 
448   // initialize dice
449   dice::init();
450   const double T_max = 30.0; // values >= T_max are handled by recursion
451   const double scale_unit32_to_T = T_max / std::numeric_limits<uint32_t>::max();
452 
453   if (InterpolationOrder != 7) abort();
454   typedef libint2::FmEval_Chebyshev7<> fmeval_t;
455   fmeval_t fmeval_cheb(mmax);
456   std::cout << "done initialization:" << std::endl;
457   std::fill(Fm_array_sum, Fm_array_sum+mmax+1, 0.0);
458   timer.clear();
459   timer.start(0);
460 #if AVOID_AUTO_VECTORIZATION
461 #pragma novector
462 #endif
463   for (int i = 0; i < nrepeats; ++i) {
464     const double T = dice::random_unit32() * scale_unit32_to_T;
465     // this computes all Fm for up to mmax
466     fmeval_cheb.eval(Fm_array, T, mmax);
467     for(int m=0; m<=mmax; ++m)
468       Fm_array_sum[m] += Fm_array[m];
469   }
470   timer.stop(0);
471 
472   std::cout << "sum of Fm:" << std::endl;
473   std::copy(Fm_array_sum, Fm_array_sum+mmax+1, std::ostream_iterator<double>(std::cout,"\n"));
474 
475   cout << "Time = " << fixed << timer.read(0) << endl;
476   cout << "Rate = " << fixed << nrepeats / timer.read(0)  << endl;
477 
478   delete[] Fm_array;
479   delete[] Fm_array_sum;
480 }
481 
482 template <unsigned InterpolationOrder>
do_taylor(int mmax,int nrepeats)483 void do_taylor(int mmax, int nrepeats) {
484   std::cout << "===================== Fm Taylor" << InterpolationOrder << " ======================" << std::endl;
485   double* Fm_array = new double[mmax+1];
486   double* Fm_array_sum = new double[mmax+1];
487 
488   // initialize dice
489   dice::init();
490   const double T_max = 30.0; // values >= T_max are handled by recursion
491   const double scale_unit32_to_T = T_max / std::numeric_limits<uint32_t>::max();
492 
493   libint2::FmEval_Taylor<double, InterpolationOrder> fmeval(mmax);
494   std::fill(Fm_array_sum, Fm_array_sum+mmax+1, 0.0);
495   timer.clear();
496   timer.start(0);
497 #if AVOID_AUTO_VECTORIZATION
498 #pragma novector
499 #endif
500   for (int i = 0; i < nrepeats; ++i)
501   {
502     const double T = dice::random_unit32() * scale_unit32_to_T;
503     // this computes all Fm for up to mmax
504     fmeval.eval(Fm_array, T, mmax);
505     for(int m=0; m<=mmax; ++m)
506       Fm_array_sum[m] += Fm_array[m];
507   }
508   timer.stop(0);
509   std::cout << "sum of Fm (Taylor):" << std::endl;
510   std::copy(Fm_array_sum, Fm_array_sum+mmax+1, std::ostream_iterator<double>(std::cout,"\n"));
511 
512   cout << "Time = " << fixed << timer.read(0) << endl;
513   cout << "Rate = " << fixed << nrepeats / timer.read(0)  << endl;
514 
515   delete[] Fm_array;
516   delete[] Fm_array_sum;
517 }
518 
519 template<OperType O>
do_stg6g(int mmax,double T,double rho,int nrepeats)520 void do_stg6g(int mmax, double T, double rho, int nrepeats) {
521   std::cout << "===================== Gm STG-6G ======================" << std::endl;
522   double* Gm_array = new double[mmax+1];
523   double* Gm_array_sum = new double[mmax+1];
524 
525   const size_t ng = 6;
526   std::vector< std::pair<double,double> > stg_ng(ng);
527   //stg_ng[0] = make_pair(4.0001, 1.0);
528 #if 1
529 #if HAVE_LAPACK
530   libint2::stg_ng_fit(ng, stg_zeta, stg_ng);
531 #else
532   if (stg_zeta != 1.0) {
533     throw std::runtime_error("without lapack stg_zeta is hardwired to 0.1");
534   }
535   stg_ng[0] = make_pair(0.16015391600067220691727771683865433704907890673261,
536                         0.20306992259915090794062652264516576964313257462623);
537   stg_ng[1] = make_pair(0.58691138376032812074703122125162923674902800850316,
538                         0.29474840080158909154305767626309566520662550324323);
539   stg_ng[2] = make_pair(1.9052880179050650706871766123674791603725239722860,
540                         0.20652315861651088693388092033220845569017370830588);
541   stg_ng[3] = make_pair(6.1508186412033182882412135545092215700186355770734,
542                         0.13232619560602867340217449542493153700747744735317);
543   stg_ng[4] = make_pair(22.558816746266648614747394893787336699960416307706,
544                         0.084097701098685716800769730376212853566993914234229);
545   stg_ng[5] = make_pair(167.12355778570626548864380047361110482628234458031,
546                         0.079234606133959413896805606690618531996594605785539);
547 #endif
548 #endif
549 
550   std::vector< std::pair<double,double> > stg_ng_sq(stg_ng.size() * stg_ng.size());
551   for(int b=0, bk=0; b<stg_ng.size(); ++b) {
552     for(int k=0; k<stg_ng.size(); ++k, ++bk) {
553       const double exp = stg_ng[b].first  + stg_ng[k].first;
554       const double coef = stg_ng[b].second * stg_ng[k].second * (O == f12_t_f12 ? 4.0 * stg_ng[b].first  * stg_ng[k].first : 1.0);
555       stg_ng_sq[bk] = make_pair(exp, coef);
556     }
557   }
558 
559   libint2::GaussianGmEval<double, (O == f12 || O == f12_2) ? 0 : (O == f12_o_r12 ? -1 : 2)>
560     gtg_eval(mmax, 1e-15);
561   std::fill(Gm_array_sum, Gm_array_sum+mmax+1, 0.0);
562   timer.clear();
563   timer.start(0);
564 #if AVOID_AUTO_VECTORIZATION
565 #pragma novector
566 #endif
567   for (int i = 0; i < nrepeats; ++i)
568   {
569     // this computes all Gm for up to mmax
570     gtg_eval.eval(Gm_array, rho, T, mmax,
571                   (O == f12 || O == f12_o_r12) ? stg_ng : stg_ng_sq);
572     for(int m=0; m<=mmax; ++m)
573       Gm_array_sum[m] += Gm_array[m];
574     T += 0.00001; // to ward-off unrealistic compiler optimizations
575     rho += 0.000001;
576   }
577   timer.stop(0);
578 
579   std::cout << "sum of Gm (STG-" << ng << "G O=" << to_string(O) << " ):" << std::endl;
580   std::copy(Gm_array_sum, Gm_array_sum+mmax+1, std::ostream_iterator<double>(std::cout,"\n"));
581 
582   cout << "Time = " << fixed << timer.read(0) << endl;
583   cout << "Rate = " << fixed << nrepeats / timer.read(0)  << endl;
584 
585   delete[] Gm_array;
586   delete[] Gm_array_sum;
587 }
588 
589 template <bool exp>
do_stg(int mmax,double T,double rho,int nrepeats)590 void do_stg(int mmax, double T, double rho, int nrepeats) {
591   const std::string label = (exp ? "STG" : "Yukawa");
592   std::cout << "===================== Gm " << label << " ======================" << std::endl;
593   double* Gm_array = new double[mmax+1];
594   double* Gm_array_sum = new double[mmax+1];
595   const auto one_over_rho = 1./rho;
596 
597   libint2::TennoGmEval<double> tenno_eval(mmax, 1e-15);
598   std::fill(Gm_array_sum, Gm_array_sum+mmax+1, 0.0);
599   timer.clear();
600   timer.start(0);
601 
602 #if AVOID_AUTO_VECTORIZATION
603 #pragma novector
604 #endif
605   for (int i = 0; i < nrepeats; ++i)
606   {
607     // this computes all Gm for up to mmax
608 //    asm("#tag1");
609     if (exp)
610       tenno_eval.eval_slater(Gm_array, one_over_rho, T, mmax, stg_zeta);
611     else
612       tenno_eval.eval_yukawa(Gm_array, one_over_rho, T, mmax, stg_zeta);
613     for(int m=0; m<=mmax; ++m)
614       Gm_array_sum[m] += Gm_array[m];
615 
616     T += 0.00001; // to ward-off unrealistic compiler optimizations
617   }
618   timer.stop(0);
619 
620   std::cout << "sum of Gm (" << label << "):" << std::endl;
621   std::copy(Gm_array_sum, Gm_array_sum+mmax+1, std::ostream_iterator<double>(std::cout,"\n"));
622 
623   cout << "Time = " << fixed << timer.read(0) << endl;
624   cout << "Rate = " << fixed << nrepeats / timer.read(0)  << endl;
625 
626   delete[] Gm_array;
627   delete[] Gm_array_sum;
628 }
629