1 /*
2 * Copyright (C) 2004-2021 Edward F. Valeev
3 *
4 * This file is part of Libint.
5 *
6 * Libint is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * Libint is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with Libint. If not, see <http://www.gnu.org/licenses/>.
18 *
19 */
20
21 #include "libint2/util/generated/libint2_params.h"
22 #ifndef LIBINT2_REALTYPE
23 #define LIBINT2_REALTYPE double
24 #endif
25 #include "libint2/boys.h"
26 #include "libint2/util/timer.h"
27 #include <ctime>
28 #include <cstdint>
29 #include <cstdlib>
30 #include <iostream>
31 #include <initializer_list>
32 #include <iterator>
33 #include <functional>
34 #include <numeric>
35 #include <cxxabi.h>
36
37 #ifndef SKIP_AXPY
38 # include <mkl_cblas.h>
39 typedef MKL_INT BLAS_INT;
40 #endif
41
42 #ifdef SKIP_ALL
43 # define SKIP_AXPY
44 # define SKIP_DOT
45 # define SKIP_GEMM
46 # define SKIP_EXP
47 # define SKIP_SQRT
48 # define SKIP_ERF
49 # define SKIP_CHEBYSHEV
50 # define SKIP_TAYLOR
51 # define SKIP_STGNG
52 # define SKIP_YUKAWA
53 #endif
54
55 #define AVOID_AUTO_VECTORIZATION 1 // set to 0 to measure performance within vectorizable loops
56 // the default is to measure performance of a single call
57
58 using namespace std;
59
60 libint2::Timers<1> timer;
61
62 enum OperType {
63 f12, f12_2, f12_o_r12, f12_t_f12
64 };
to_string(OperType o)65 std::string to_string(OperType o) {
66 std::string result;
67 switch (o) {
68 case f12: result = "f12"; break;
69 case f12_2: result = "f12_2"; break;
70 case f12_o_r12: result = "f12_o_r12"; break;
71 case f12_t_f12: result = "f12_t_f12"; break;
72 }
73 return result;
74 }
75
76 /* These state variables must be initialized so that they are not all zero. */
77 namespace dice {
78 uint32_t x, y, z, w;
init(unsigned int seed=159265359)79 void init(unsigned int seed = 159265359) {
80 srand(seed);
81 x=rand();
82 y=rand();
83 z=rand();
84 w=rand();
85 }
86 // uses xorshift128 algorithm, see http://en.wikipedia.org/wiki/Xorshift
random_unit32(void)87 uint32_t random_unit32(void) {
88 uint32_t t = x ^ (x << 11);
89 x = y; y = z; z = w;
90 return w = w ^ (w >> 19) ^ t ^ (t >> 8);
91 }
92 };
93
94 /** profiles Kernel by running it repeatedly, and reporting the sum of values
95 *
96 * @param k the Kernel, needs to have 2 member functions: label and and eval, neither of which takes a parameter
97 * @param nrepeats the number of times the kernel is run
98 */
99 template <typename Kernel> void profile(const Kernel& k, int nrepeats);
100
101 template <unsigned InterpolationOrder = 7>
102 void do_chebyshev(int mmax, int nrepeats);
103 template <unsigned InterpolationOrder = 7>
104 void do_taylor(int mmax, int nrepeats);
105 template <OperType O> void do_stg6g(int mmax, double T, double rho, int nrepeats);
106 template <bool exp = false>
107 void do_stg(int mmax, double T, double U, int nrepeats);
108
109 template <typename Real, Real (Function)(Real) >
110 struct BasicKernel {
111 typedef Real ResultType;
BasicKernelBasicKernel112 BasicKernel(Real T, std::string label, double T_dec = 0.00001) : label_(label),
113 T_(T), T_dec_(T_dec), sum_(0) {}
labelBasicKernel114 std::string label() const { return label_; }
evalBasicKernel115 void eval() const {
116 sum_ += Function(T_);
117 T_ += T_dec_;
118 }
sumBasicKernel119 Real sum() const {
120 return sum_;
121 }
ops_per_evalBasicKernel122 size_t ops_per_eval() const {
123 return 1;
124 }
125 std::string label_;
126 mutable Real T_;
127 Real T_dec_;
128 mutable Real sum_;
129 };
130
131 template <typename Real>
132 struct VectorOpKernel {
VectorOpKernelVectorOpKernel133 VectorOpKernel(size_t veclen,
134 size_t nargs,
135 Real T,
136 std::string label) : result_(veclen, Real(0)), args_(nargs), label_(label) {
137 for(size_t i=0; i<args_.size(); ++i) {
138 args_[i] = new Real[veclen];
139 std::fill(args_[i], args_[i] + veclen, T);
140 }
141 }
VectorOpKernelVectorOpKernel142 VectorOpKernel(std::initializer_list<size_t> sizes, Real T,
143 std::string label)
144 : result_(*sizes.begin(), Real(0)), args_(sizes.size() - 1), label_(label) {
145 auto size_iter = sizes.begin() + 1;
146 for (size_t i = 0; i < args_.size(); ++i, ++size_iter) {
147 args_[i] = new Real[*size_iter];
148 std::fill(args_[i], args_[i] + *size_iter, T);
149 }
150 }
~VectorOpKernelVectorOpKernel151 ~VectorOpKernel() {
152 for(size_t i=0; i<args_.size(); ++i) {
153 delete[] args_[i];
154 }
155 }
labelVectorOpKernel156 std::string label() const {
157 return label_;
158 }
159
160 std::vector<Real> result_;
161 std::vector<Real*> args_;
162 std::string label_;
163 };
164
165 template <typename Real>
166 struct AXPYKernel : public VectorOpKernel<Real> {
167 typedef Real ResultType;
AXPYKernelAXPYKernel168 AXPYKernel(size_t veclen,
169 Real a,
170 Real T,
171 std::string label) : VectorOpKernel<Real>(veclen, 1, T, label), a_(a) {
172 y_ = &VectorOpKernel<Real>::result_[0];
173 x_ = VectorOpKernel<Real>::args_[0];
174 n_ = VectorOpKernel<Real>::result_.size();
175 }
evalAXPYKernel176 void eval() const {
177 #pragma ivdep
178 for(size_t v=0; v<n_; ++v)
179 y_[v] += a_ * x_[v];
180 }
sumAXPYKernel181 Real sum() const {
182 return std::accumulate(y_, y_+n_, Real(0));
183 }
ops_per_evalAXPYKernel184 size_t ops_per_eval() const {
185 return n_ * 2;
186 }
187
188 Real a_;
189 Real* y_;
190 Real const* x_;
191 BLAS_INT n_;
192
193 };
194
195 struct DAXPYKernel : public VectorOpKernel<double> {
196 typedef double ResultType;
197 typedef double Real;
DAXPYKernelDAXPYKernel198 DAXPYKernel(size_t veclen,
199 Real a,
200 Real T,
201 std::string label) : VectorOpKernel<Real>(veclen, 1, T, label), a_(a) {
202 y_ = &VectorOpKernel<Real>::result_[0];
203 x_ = VectorOpKernel<Real>::args_[0];
204 n_ = VectorOpKernel<Real>::result_.size();
205 }
evalDAXPYKernel206 void eval() const {
207 cblas_daxpy(n_, a_, x_, 1, y_, 1);
208 }
sumDAXPYKernel209 Real sum() const {
210 return std::accumulate(y_, y_+n_, Real(0));
211 }
ops_per_evalDAXPYKernel212 size_t ops_per_eval() const {
213 return n_ * 2;
214 }
215
216 Real a_;
217 Real* y_;
218 Real* x_;
219 size_t n_;
220
221 };
222
223 template <typename Real>
224 struct DOTKernel : public VectorOpKernel<Real> {
225 typedef Real ResultType;
DOTKernelDOTKernel226 DOTKernel(size_t veclen,
227 Real T,
228 std::string label) : VectorOpKernel<Real>(veclen, 2, T, label), result_(0) {
229 x1_ = VectorOpKernel<Real>::args_[0];
230 x2_ = VectorOpKernel<Real>::args_[1];
231 n_ = VectorOpKernel<Real>::result_.size();
232 }
evalDOTKernel233 void eval() const {
234 #pragma ivdep
235 for(size_t v=0; v<n_; ++v)
236 result_ += x1_[v] * x2_[v];
237 }
sumDOTKernel238 Real sum() const {
239 return result_;
240 }
ops_per_evalDOTKernel241 size_t ops_per_eval() const {
242 return n_ * 2;
243 }
244
245 mutable Real result_;
246 Real const* x1_;
247 Real const* x2_;
248 BLAS_INT n_;
249
250 };
251
252 struct DDOTKernel : public VectorOpKernel<double> {
253 typedef double ResultType;
254 typedef double Real;
DDOTKernelDDOTKernel255 DDOTKernel(size_t veclen,
256 Real T,
257 std::string label) : VectorOpKernel<Real>(veclen, 2, T, label), result_(0) {
258 x1_ = VectorOpKernel<Real>::args_[0];
259 x2_ = VectorOpKernel<Real>::args_[1];
260 n_ = VectorOpKernel<Real>::result_.size();
261 }
evalDDOTKernel262 void eval() const {
263 result_ += cblas_ddot(n_, x1_, 1, x2_, 1);
264 }
sumDDOTKernel265 Real sum() const {
266 return result_;
267 }
ops_per_evalDDOTKernel268 size_t ops_per_eval() const {
269 return n_ * 2;
270 }
271
272 mutable Real result_;
273 const Real* x1_;
274 const Real* x2_;
275 size_t n_;
276
277 };
278
279 struct DGEMMKernel : public VectorOpKernel<double> {
280 typedef double ResultType;
281 typedef double Real;
DGEMMKernelDGEMMKernel282 DGEMMKernel(size_t n,
283 Real T,
284 std::string label) : VectorOpKernel<Real>(n*n, 2, T, label), m_(n), n_(n), k_(n) {
285 c_ = &VectorOpKernel<Real>::result_[0];
286 a_ = VectorOpKernel<Real>::args_[0];
287 b_ = VectorOpKernel<Real>::args_[1];
288 }
DGEMMKernelDGEMMKernel289 DGEMMKernel(size_t m, size_t n, size_t k, Real T, std::string label)
290 : VectorOpKernel<Real>({m * n, m * k, k * n}, T, label),
291 m_(m),
292 n_(n),
293 k_(k) {
294 c_ = &VectorOpKernel<Real>::result_[0];
295 a_ = VectorOpKernel<Real>::args_[0];
296 b_ = VectorOpKernel<Real>::args_[1];
297 }
evalDGEMMKernel298 void eval() const {
299 cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m_, n_, k_,
300 1.0, a_, k_,
301 b_, n_, 1.0, c_, n_);
302 }
sumDGEMMKernel303 Real sum() const {
304 return std::accumulate(c_, c_+m_*n_, 0.0);
305 }
306 // approximate
ops_per_evalDGEMMKernel307 size_t ops_per_eval() const {
308 return m_ * n_ * k_ * 2;
309 }
310
311 Real* c_; // m by n
312 const Real* a_; // m by k
313 const Real* b_; // k by n
314 size_t m_;
315 size_t n_;
316 size_t k_;
317 };
318
319 const double stg_zeta = 1.0;
320
main(int argc,char * argv[])321 int main(int argc, char* argv[]) {
322 if (argc < 2 or argc > 4) {
323 std::cout << "Description: profiles secondary compute-intensive kernels (Boys, daxpy, etc.)" << std::endl;
324 std::cout << "Usage: profile mmax T nrepeats" << std::endl;
325 return 1;
326 }
327 const int mmax = atoi(argv[1]);
328 const double T = atol(argv[2]);
329 const double rho = 1.0;
330 const int nrepeats = atoi(argv[3]);
331
332 using libint2::simd::VectorSSEDouble;
333 const VectorSSEDouble T_sse(T);
334 #if defined(__AVX__)
335 using libint2::simd::VectorAVXDouble;
336 const VectorAVXDouble T_avx(T);
337 #endif
338
339 cout << "mmax = " << mmax << endl;
340 cout << " T = "<< T << endl;
341
342 // changes precision of cout to 15 for printing doubles and such.
343 cout.precision(15);
344 // set the overhead of std::high_resolution_clock; measure carefully
345 timer.set_now_overhead(25);
346
347 #ifndef SKIP_AXPY
348 const int n = 128;
349 profile(DAXPYKernel(n, 1.0, 1.0, "daxpy"), nrepeats);
350 profile(AXPYKernel<double>(n, 1.0, 1.0, "axpy [double]"), nrepeats);
351 profile(AXPYKernel<VectorSSEDouble>(n, 1.0, 1.0, "axpy [SSE]"), nrepeats);
352 # if defined(__AVX__)
353 profile(AXPYKernel<VectorAVXDouble>(n, 1.0, 1.0, "axpy [AVX]"), nrepeats);
354 # endif
355 #endif
356
357 #ifndef SKIP_DOT
358 //const int n = 4096;
359 profile(DDOTKernel(n, 1.0, "ddot"), nrepeats);
360 profile(DOTKernel<double>(n, 1.0, "dot [double]"), nrepeats);
361 profile(DOTKernel<VectorSSEDouble>(n, 1.0, "dot [SSE]"), nrepeats);
362 # if defined(__AVX__)
363 profile(DOTKernel<VectorAVXDouble>(n, 1.0, "dot [AVX]"), nrepeats);
364 # endif
365 #endif
366
367 #ifndef SKIP_GEMM
368 const int nn = 2048;
369 profile(DGEMMKernel(nn, 1.0, "dgemm (2048,2048) x (2048,2048)"), 1);
370 #endif
371
372 #ifndef SKIP_EXP
373 profile(BasicKernel<double,std::exp>(-T,"exp(-T) [double]", -0.00001), nrepeats);
374 profile(BasicKernel<VectorSSEDouble,libint2::simd::exp>(-T,"exp(-T) [SSE]", -0.00001), nrepeats);
375 # if defined(__AVX__)
376 profile(BasicKernel<VectorAVXDouble,libint2::simd::exp>(-T,"exp(-T) [AVX]", -0.00001), nrepeats);
377 # endif
378 #endif
379 #ifndef SKIP_SQRT
380 profile(BasicKernel<double,std::sqrt>(T,"sqrt(T) [double]"), nrepeats);
381 profile(BasicKernel<VectorSSEDouble,libint2::simd::sqrt>(T,"sqrt(T) [SSE]"), nrepeats);
382 # if defined(__AVX__)
383 profile(BasicKernel<VectorAVXDouble,libint2::simd::sqrt>(T,"sqrt(T) [AVX]"), nrepeats);
384 # endif
385 #endif
386 #ifndef SKIP_ERF
387 profile(BasicKernel<double,erf>(T,"erf(T) [double]"), nrepeats);
388 profile(BasicKernel<VectorSSEDouble,libint2::simd::erf>(T,"erf(T) [SSE]"), nrepeats);
389 # if defined(__AVX__)
390 profile(BasicKernel<VectorAVXDouble,libint2::simd::erf>(T,"erf(T) [AVX]"), nrepeats);
391 # endif
392 profile(BasicKernel<double,erfc>(T,"erfc(T) [double]"), nrepeats);
393 profile(BasicKernel<VectorSSEDouble,libint2::simd::erfc>(T,"erfc(T) [SSE]"), nrepeats);
394 # if defined(__AVX__)
395 profile(BasicKernel<VectorAVXDouble,libint2::simd::erfc>(T,"erfc(T) [AVX]"), nrepeats);
396 # endif
397 #endif
398 #ifndef SKIP_CHEBYSHEV
399 do_chebyshev<7>(mmax, nrepeats);
400 #endif
401 #ifndef SKIP_TAYLOR
402 // do_taylor<3>(mmax, nrepeats);
403 do_taylor<7>(mmax, nrepeats);
404 #endif
405 #ifndef SKIP_STGNG
406 do_stg6g<f12>(mmax, T, rho, nrepeats);
407 do_stg6g<f12_o_r12>(mmax, T, rho, nrepeats);
408 do_stg6g<f12_2>(mmax, T, rho, nrepeats);
409 do_stg6g<f12_t_f12>(mmax, T, rho, nrepeats);
410 #endif
411 #ifndef SKIP_YUKAWA
412 do_stg<true>(mmax, T, rho, nrepeats);
413 do_stg<false>(mmax, T, rho, nrepeats);
414 #endif
415 return 0;
416 }
417
418 template <typename Kernel>
profile(const Kernel & k,int nrepeats)419 void profile(const Kernel& k, int nrepeats) {
420 std::cout << "===================== " << k.label() << " ======================" << std::endl;
421
422 typedef typename Kernel::ResultType Real;
423
424 timer.clear();
425 timer.start(0);
426
427 #if AVOID_AUTO_VECTORIZATION
428 #pragma novector
429 #endif
430 for (int i = 0; i < nrepeats; ++i) {
431 k.eval();
432 }
433 timer.stop(0);
434
435 std::cout << "sum of " << k.label() << ": " << k.sum() << std::endl;
436
437 cout << "Time = " << fixed << timer.read(0) << endl;
438 cout << "Rate = " << fixed << nrepeats * k.ops_per_eval() / timer.read(0) << endl;
439 }
440
441 template <unsigned InterpolationOrder>
do_chebyshev(int mmax,int nrepeats)442 void do_chebyshev(int mmax, int nrepeats) {
443 std::cout << "===================== Fm Cheb" << InterpolationOrder << " ======================" << std::endl;
444 double* Fm_array = new double[mmax+1];
445 double* Fm_array_sum = new double[mmax+1];
446 //std::cout << "alignment of Fm = " << reinterpret_cast<unsigned long int>((char*) Fm_array) % 32ul << " bytes\n";
447
448 // initialize dice
449 dice::init();
450 const double T_max = 30.0; // values >= T_max are handled by recursion
451 const double scale_unit32_to_T = T_max / std::numeric_limits<uint32_t>::max();
452
453 if (InterpolationOrder != 7) abort();
454 typedef libint2::FmEval_Chebyshev7<> fmeval_t;
455 fmeval_t fmeval_cheb(mmax);
456 std::cout << "done initialization:" << std::endl;
457 std::fill(Fm_array_sum, Fm_array_sum+mmax+1, 0.0);
458 timer.clear();
459 timer.start(0);
460 #if AVOID_AUTO_VECTORIZATION
461 #pragma novector
462 #endif
463 for (int i = 0; i < nrepeats; ++i) {
464 const double T = dice::random_unit32() * scale_unit32_to_T;
465 // this computes all Fm for up to mmax
466 fmeval_cheb.eval(Fm_array, T, mmax);
467 for(int m=0; m<=mmax; ++m)
468 Fm_array_sum[m] += Fm_array[m];
469 }
470 timer.stop(0);
471
472 std::cout << "sum of Fm:" << std::endl;
473 std::copy(Fm_array_sum, Fm_array_sum+mmax+1, std::ostream_iterator<double>(std::cout,"\n"));
474
475 cout << "Time = " << fixed << timer.read(0) << endl;
476 cout << "Rate = " << fixed << nrepeats / timer.read(0) << endl;
477
478 delete[] Fm_array;
479 delete[] Fm_array_sum;
480 }
481
482 template <unsigned InterpolationOrder>
do_taylor(int mmax,int nrepeats)483 void do_taylor(int mmax, int nrepeats) {
484 std::cout << "===================== Fm Taylor" << InterpolationOrder << " ======================" << std::endl;
485 double* Fm_array = new double[mmax+1];
486 double* Fm_array_sum = new double[mmax+1];
487
488 // initialize dice
489 dice::init();
490 const double T_max = 30.0; // values >= T_max are handled by recursion
491 const double scale_unit32_to_T = T_max / std::numeric_limits<uint32_t>::max();
492
493 libint2::FmEval_Taylor<double, InterpolationOrder> fmeval(mmax);
494 std::fill(Fm_array_sum, Fm_array_sum+mmax+1, 0.0);
495 timer.clear();
496 timer.start(0);
497 #if AVOID_AUTO_VECTORIZATION
498 #pragma novector
499 #endif
500 for (int i = 0; i < nrepeats; ++i)
501 {
502 const double T = dice::random_unit32() * scale_unit32_to_T;
503 // this computes all Fm for up to mmax
504 fmeval.eval(Fm_array, T, mmax);
505 for(int m=0; m<=mmax; ++m)
506 Fm_array_sum[m] += Fm_array[m];
507 }
508 timer.stop(0);
509 std::cout << "sum of Fm (Taylor):" << std::endl;
510 std::copy(Fm_array_sum, Fm_array_sum+mmax+1, std::ostream_iterator<double>(std::cout,"\n"));
511
512 cout << "Time = " << fixed << timer.read(0) << endl;
513 cout << "Rate = " << fixed << nrepeats / timer.read(0) << endl;
514
515 delete[] Fm_array;
516 delete[] Fm_array_sum;
517 }
518
519 template<OperType O>
do_stg6g(int mmax,double T,double rho,int nrepeats)520 void do_stg6g(int mmax, double T, double rho, int nrepeats) {
521 std::cout << "===================== Gm STG-6G ======================" << std::endl;
522 double* Gm_array = new double[mmax+1];
523 double* Gm_array_sum = new double[mmax+1];
524
525 const size_t ng = 6;
526 std::vector< std::pair<double,double> > stg_ng(ng);
527 //stg_ng[0] = make_pair(4.0001, 1.0);
528 #if 1
529 #if HAVE_LAPACK
530 libint2::stg_ng_fit(ng, stg_zeta, stg_ng);
531 #else
532 if (stg_zeta != 1.0) {
533 throw std::runtime_error("without lapack stg_zeta is hardwired to 0.1");
534 }
535 stg_ng[0] = make_pair(0.16015391600067220691727771683865433704907890673261,
536 0.20306992259915090794062652264516576964313257462623);
537 stg_ng[1] = make_pair(0.58691138376032812074703122125162923674902800850316,
538 0.29474840080158909154305767626309566520662550324323);
539 stg_ng[2] = make_pair(1.9052880179050650706871766123674791603725239722860,
540 0.20652315861651088693388092033220845569017370830588);
541 stg_ng[3] = make_pair(6.1508186412033182882412135545092215700186355770734,
542 0.13232619560602867340217449542493153700747744735317);
543 stg_ng[4] = make_pair(22.558816746266648614747394893787336699960416307706,
544 0.084097701098685716800769730376212853566993914234229);
545 stg_ng[5] = make_pair(167.12355778570626548864380047361110482628234458031,
546 0.079234606133959413896805606690618531996594605785539);
547 #endif
548 #endif
549
550 std::vector< std::pair<double,double> > stg_ng_sq(stg_ng.size() * stg_ng.size());
551 for(int b=0, bk=0; b<stg_ng.size(); ++b) {
552 for(int k=0; k<stg_ng.size(); ++k, ++bk) {
553 const double exp = stg_ng[b].first + stg_ng[k].first;
554 const double coef = stg_ng[b].second * stg_ng[k].second * (O == f12_t_f12 ? 4.0 * stg_ng[b].first * stg_ng[k].first : 1.0);
555 stg_ng_sq[bk] = make_pair(exp, coef);
556 }
557 }
558
559 libint2::GaussianGmEval<double, (O == f12 || O == f12_2) ? 0 : (O == f12_o_r12 ? -1 : 2)>
560 gtg_eval(mmax, 1e-15);
561 std::fill(Gm_array_sum, Gm_array_sum+mmax+1, 0.0);
562 timer.clear();
563 timer.start(0);
564 #if AVOID_AUTO_VECTORIZATION
565 #pragma novector
566 #endif
567 for (int i = 0; i < nrepeats; ++i)
568 {
569 // this computes all Gm for up to mmax
570 gtg_eval.eval(Gm_array, rho, T, mmax,
571 (O == f12 || O == f12_o_r12) ? stg_ng : stg_ng_sq);
572 for(int m=0; m<=mmax; ++m)
573 Gm_array_sum[m] += Gm_array[m];
574 T += 0.00001; // to ward-off unrealistic compiler optimizations
575 rho += 0.000001;
576 }
577 timer.stop(0);
578
579 std::cout << "sum of Gm (STG-" << ng << "G O=" << to_string(O) << " ):" << std::endl;
580 std::copy(Gm_array_sum, Gm_array_sum+mmax+1, std::ostream_iterator<double>(std::cout,"\n"));
581
582 cout << "Time = " << fixed << timer.read(0) << endl;
583 cout << "Rate = " << fixed << nrepeats / timer.read(0) << endl;
584
585 delete[] Gm_array;
586 delete[] Gm_array_sum;
587 }
588
589 template <bool exp>
do_stg(int mmax,double T,double rho,int nrepeats)590 void do_stg(int mmax, double T, double rho, int nrepeats) {
591 const std::string label = (exp ? "STG" : "Yukawa");
592 std::cout << "===================== Gm " << label << " ======================" << std::endl;
593 double* Gm_array = new double[mmax+1];
594 double* Gm_array_sum = new double[mmax+1];
595 const auto one_over_rho = 1./rho;
596
597 libint2::TennoGmEval<double> tenno_eval(mmax, 1e-15);
598 std::fill(Gm_array_sum, Gm_array_sum+mmax+1, 0.0);
599 timer.clear();
600 timer.start(0);
601
602 #if AVOID_AUTO_VECTORIZATION
603 #pragma novector
604 #endif
605 for (int i = 0; i < nrepeats; ++i)
606 {
607 // this computes all Gm for up to mmax
608 // asm("#tag1");
609 if (exp)
610 tenno_eval.eval_slater(Gm_array, one_over_rho, T, mmax, stg_zeta);
611 else
612 tenno_eval.eval_yukawa(Gm_array, one_over_rho, T, mmax, stg_zeta);
613 for(int m=0; m<=mmax; ++m)
614 Gm_array_sum[m] += Gm_array[m];
615
616 T += 0.00001; // to ward-off unrealistic compiler optimizations
617 }
618 timer.stop(0);
619
620 std::cout << "sum of Gm (" << label << "):" << std::endl;
621 std::copy(Gm_array_sum, Gm_array_sum+mmax+1, std::ostream_iterator<double>(std::cout,"\n"));
622
623 cout << "Time = " << fixed << timer.read(0) << endl;
624 cout << "Rate = " << fixed << nrepeats / timer.read(0) << endl;
625
626 delete[] Gm_array;
627 delete[] Gm_array_sum;
628 }
629