1 /*
2  *  Copyright (C) 2004-2021 Edward F. Valeev
3  *
4  *  This file is part of Libint.
5  *
6  *  Libint is free software: you can redistribute it and/or modify
7  *  it under the terms of the GNU Lesser General Public License as published by
8  *  the Free Software Foundation, either version 3 of the License, or
9  *  (at your option) any later version.
10  *
11  *  Libint is distributed in the hope that it will be useful,
12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  *  GNU Lesser General Public License for more details.
15  *
16  *  You should have received a copy of the GNU Lesser General Public License
17  *  along with Libint.  If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 #ifndef _libint2_src_lib_libint_boys_h_
22 #define _libint2_src_lib_libint_boys_h_
23 
24 #if defined(__cplusplus)
25 
26 #include <iostream>
27 #include <cstdlib>
28 #include <cmath>
29 #include <stdexcept>
30 #include <libint2/util/vector.h>
31 #include <cassert>
32 #include <vector>
33 #include <algorithm>
34 #include <limits>
35 #include <mutex>
36 #include <type_traits>
37 #include <memory>
38 
39 // from now on at least C++11 is required by default
40 #include <libint2/util/cxxstd.h>
41 #if LIBINT2_CPLUSPLUS_STD < 2011
42 # error "Libint2 C++ API requires C++11 support"
43 #endif
44 
45 #include <libint2/boys_fwd.h>
46 #include <libint2/numeric.h>
47 #include <libint2/initialize.h>
48 
49 #if HAVE_LAPACK // use F77-type interface for now, switch to LAPACKE later
50 extern "C" void dgesv_(const int* n,
51                        const int* nrhs, double* A, const int* lda,
52                        int* ipiv, double* b, const int* ldb,
53                        int* info);
54 #endif
55 
56 namespace libint2 {
57 
58   /// holds tables of expensive quantities
59   template<typename Real>
60   class ExpensiveNumbers {
61     public:
62       ExpensiveNumbers(int ifac = -1, int idf = -1, int ibc = -1) {
63         if (ifac >= 0) {
64           fac.resize(ifac + 1);
65           fac[0] = 1.0;
66           for (int i = 1; i <= ifac; i++) {
67             fac[i] = i * fac[i - 1];
68           }
69         }
70 
71         if (idf >= 0) {
72           df.resize(idf + 1);
73           /* df[i] gives (i-1)!!, so that (-1)!! is defined... */
74           df[0] = 1.0;
75           if (idf >= 1)
76             df[1] = 1.0;
77           if (idf >= 2)
78             df[2] = 1.0;
79           for (int i = 3; i <= idf; i++) {
80             df[i] = (i - 1) * df[i - 2];
81           }
82         }
83 
84         if (ibc >= 0) {
85           bc_.resize((ibc+1)*(ibc+1));
86           std::fill(bc_.begin(), bc_.end(), Real(0));
87           bc.resize(ibc+1);
88           bc[0] = &bc_[0];
89           for(int i=1; i<=ibc; ++i)
90             bc[i] = bc[i-1] + (ibc+1);
91 
92           for(int i=0; i<=ibc; i++)
93             bc[i][0] = 1.0;
94           for(int i=0; i<=ibc; i++)
95             for(int j=1; j<=i; ++j)
96               bc[i][j] = bc[i][j-1] * Real(i-j+1) / Real(j);
97         }
98 
99         for (int i = 0; i < 128; i++) {
100           twoi1[i] = 1.0 / (Real(2.0) * i + Real(1.0));
101           ihalf[i] = Real(i) - Real(0.5);
102         }
103 
104       }
105 
~ExpensiveNumbers()106       ~ExpensiveNumbers() {
107       }
108 
109       std::vector<Real> fac;
110       std::vector<Real> df;
111       std::vector<Real*> bc;
112 
113       // these quantitites are needed with indices <= mmax
114       // 64 is sufficient to handle up to 4 center integrals with up to L=15 basis functions
115       // but need higher values for Yukawa integrals ...
116       Real twoi1[128]; /* 1/(2 i + 1); needed for downward recursion */
117       Real ihalf[128]; /* i - 0.5, needed for upward recursion */
118 
119     private:
120       std::vector<Real> bc_;
121   };
122 
123   /** Computes the Boys function, \f$ F_m (T) = \int_0^1 u^{2m} \exp(-T u^2) \, {\rm d}u \f$,
124     * using single algorithm (asymptotic expansion). Slow for the sake of precision control.
125     * Useful in two cases:
126     * <ul>
127     *   <li> for reference purposes, if \c Real supports high/arbitrary precision, and </li>
128     *   <li> for moderate values of \f$ T \f$, if \c Real is a low-precision floating-point type.
129     *        N.B. FmEval_Reference2 , which can compute for all practical values of \f$ T \f$ and \f$ m \f$, is recommended
130     *        with standard \c Real types (\c double and \c float). </li>
131     * </ul>
132     *
133     * \note Precision is controlled heuristically, i.e. cannot be guaranteed mathematically;
134     *       will stop if absolute precision is reached, or precision of \c Real is exhausted.
135     *       It is important that \c std::numeric_limits<Real> is defined appropriately.
136     *
137     * @tparam Real the type to use for all floating-point computations.
138     *         Must be able to compute logarithm and exponential, i.e.
139     *         log(x) and exp(x), where x is Real, must be valid expressions.
140     */
141   template<typename Real>
142   struct FmEval_Reference {
143 
instanceFmEval_Reference144       static std::shared_ptr<const FmEval_Reference> instance(int /* mmax */, Real /* precision */) {
145 
146         // thread-safe per C++11 standard [6.7.4]
147         static auto instance_ = std::make_shared<const FmEval_Reference>();
148 
149         return instance_;
150       }
151 
152       /// computes a single value of \f$ F_m(T) \f$ using MacLaurin series to full precision of @c Real
evalFmEval_Reference153       static Real eval(Real T, size_t m) {
154         assert(m < 100);
155         static const Real T_crit = std::numeric_limits<Real>::is_bounded ? -log( std::numeric_limits<Real>::min() * 100.5 / 2. ) : Real(0) ;
156         if (std::numeric_limits<Real>::is_bounded && T > T_crit)
157           throw std::overflow_error("FmEval_Reference<Real>::eval: Real lacks precision for the given value of argument T");
158         static const Real half = Real(1)/2;
159         Real denom = (m + half);
160         using std::exp;
161         Real term = exp(-T) / (2 * denom);
162         Real old_term = 0;
163         Real sum = term;
164         const Real epsilon = get_epsilon(T);
165         const Real epsilon_divided_10 = epsilon / 10;
166         do {
167           denom += 1;
168           old_term = term;
169           term = old_term * T / denom;
170           sum += term;
171           //rel_error = term / sum , hence iterate until rel_error = epsilon
172           // however, must ensure that contributions are decreasing to ensure that omitted contributions are smaller than epsilon
173         } while (term > sum * epsilon_divided_10 || old_term < term);
174 
175         return sum;
176       }
177 
178       /// fills up an array of Fm(T) for m in [0,mmax]
179       /// @param[out] Fm array to be filled in with the Boys function values, must be at least mmax+1 elements long
180       /// @param[in] T the Boys function argument
181       /// @param[in] mmax the maximum value of m for which Boys function will be computed;
evalFmEval_Reference182       static void eval(Real* Fm, Real T, size_t mmax) {
183 
184         // evaluate for mmax using MacLaurin series
185         // it converges fastest for the largest m -> use it to compute Fmmax(T)
186         //  see JPC 94, 5564 (1990).
187         for(size_t m=0; m<=mmax; ++m)
188           Fm[m] = eval(T, m);
189         return;
190       }
191 
192   };
193 
194   /** Computes the Boys function, \$ F_m (T) = \int_0^1 u^{2m} \exp(-T u^2) \, {\rm d}u \$,
195     * using multi-algorithm approach (upward recursion for T>=117, and asymptotic summation for T<117).
196     * This is slow and should be used for reference purposes, e.g. computing the interpolation tables.
197     * Precision is not always guaranteed as it is limited by the precision of \c Real type.
198     * When \c Real is \c double, can maintain absolute precision of epsilon for up to m=40.
199     *
200     * @tparam Real the type to use for all floating-point computations.
201     *         Must be able to compute logarithm, exponential, square root, and error function, i.e.
202     *         log(x), exp(x), sqrt(x), and erf(x), where x is Real, must be valid expressions.
203     */
204   template<typename Real>
205   struct FmEval_Reference2 {
206 
instanceFmEval_Reference2207       static std::shared_ptr<const FmEval_Reference2> instance(int /* mmax */, Real /* precision */) {
208 
209         // thread-safe per C++11 standard [6.7.4]
210         static auto instance_ = std::make_shared<const FmEval_Reference2>();
211 
212         return instance_;
213       }
214 
215       /// fills up an array of Fm(T) for m in [0,mmax]
216       /// @param[out] Fm array to be filled in with the Boys function values, must be at least mmax+1 elements long
217       /// @param[in] t the Boys function argument
218       /// @param[in] mmax the maximum value of m for which Boys function will be computed;
evalFmEval_Reference2219       static void eval(Real* Fm, Real t, size_t mmax) {
220 
221         if (t < Real(117)) {
222           FmEval_Reference<Real>::eval(Fm,t,mmax);
223         }
224         else {
225           const Real two_over_sqrt_pi{1.128379167095512573896158903121545171688101258657997713688171443421284936882986828973487320404214727};
226           const Real K = 1/two_over_sqrt_pi;
227 
228           auto t2 = 2*t;
229           auto et = exp(-t);
230           auto sqrt_t = sqrt(t);
231           Fm[0] = K*erf(sqrt_t)/sqrt_t;
232           if (mmax > 0)
233           for(size_t m=0; m<=mmax-1; m++) {
234             Fm[m+1] = ((2*m + 1)*Fm[m] - et)/(t2);
235           }
236         }
237       }
238 
239   };
240 
241   /** Computes the Boys function, \$ F_m (T) = \int_0^1 u^{2m} \exp(-T u^2) \, {\rm d}u \$,
242     * using 7-th order Chebyshev interpolation.
243     */
244   template <typename Real = double>
245   class FmEval_Chebyshev7 {
246 
247 #include <libint2/boys_cheb7.h>
248 
249       static_assert(std::is_same<Real,double>::value, "FmEval_Chebyshev7 only supports double as the real type");
250 
251       static constexpr int ORDER = interpolation_order;   //!, interpolation order
252       static constexpr int ORDERp1 = ORDER+1;   //!< ORDER + 1
253 
254       static constexpr Real T_crit = cheb_table_tmax;          //!< critical value of T above which safe to use upward recusion
255       static constexpr Real delta = cheb_table_delta;           //!< interval size
256       static constexpr Real one_over_delta = 1/delta;  //! 1/delta
257 
258       int mmax;                   //!< the maximum m that is tabulated
259       ExpensiveNumbers<double> numbers_;
260       Real *c; /* the Chebyshev coefficients table, T_crit*one_over_delta by mmax*ORDERp1 */
261 
262     public:
263       /// \param m_max maximum value of the Boys function index; set to -1 to skip initialization
264       /// \param precision the desired relative precision
265       /// \throw std::invalid_argument if \c m_max is greater than \c cheb_table_mmax (see boys_cheb7.h)
266       /// \throw std::invalid_argument if \c precision is smaller than std::numeric_limits<double>::epsilon()
267       FmEval_Chebyshev7(int m_max, double precision = std::numeric_limits<double>::epsilon()) :
mmax(m_max)268           mmax(m_max), numbers_(14) {
269 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)
270 # if !defined(__AVX__) && defined(NDEBUG)
271         if (libint2::verbose()) {
272           static bool printed_performance_warning = false;
273           if (!printed_performance_warning) {
274             libint2::verbose_stream()
275                 << "libint2::FmEval_Chebyshev7 on x86(-64) platforms needs AVX support for best performance"
276                 << std::endl;
277             printed_performance_warning = true;
278           }
279         }
280 # endif
281 #endif
282         if (precision < std::numeric_limits<double>::epsilon())
283           throw std::invalid_argument(std::string("FmEval_Chebyshev7 does not support precision smaller than ") + std::to_string(std::numeric_limits<double>::epsilon()));
284         if (mmax > cheb_table_mmax)
285           throw std::invalid_argument(
286               "FmEval_Chebyshev7::init() : requested mmax exceeds the "
287               "hard-coded mmax");
288         if (m_max >= 0)
289           init_table();
290       }
~FmEval_Chebyshev7()291       ~FmEval_Chebyshev7() {
292         if (mmax >= 0) {
293           free(c);
294         }
295       }
296 
297       /// Singleton interface allows to manage the lone instance; adjusts max m values as needed in thread-safe fashion
298       static std::shared_ptr<const FmEval_Chebyshev7> instance(int m_max, double = 0.0) {
299 
300         assert(m_max >= 0);
301         // thread-safe per C++11 standard [6.7.4]
302         static auto instance_ = std::make_shared<const FmEval_Chebyshev7>(m_max);
303 
304         while (instance_->max_m() < m_max) {
305           static std::mutex mtx;
306           std::lock_guard<std::mutex> lck(mtx);
307           if (instance_->max_m() < m_max) {
308             auto new_instance = std::make_shared<const FmEval_Chebyshev7>(m_max);
309             instance_ = new_instance;
310           }
311         }
312 
313         return instance_;
314       }
315 
316       /// @return the maximum value of m for which the Boys function can be computed with this object
max_m()317       int max_m() const { return mmax; }
318 
319       /// fills in Fm with computed Boys function values for m in [0,mmax]
320       /// @param[out] Fm array to be filled in with the Boys function values, must be at least mmax+1 elements long
321       /// @param[in] x the Boys function argument
322       /// @param[in] mmax the maximum value of m for which Boys function will be computed; mmax must be <= the value returned by max_m
eval(Real * Fm,Real x,int m_max)323       inline void eval(Real* Fm, Real x, int m_max) const {
324 
325         // large T => use upward recursion
326         // cost = 1 div + 1 sqrt + (1 + 2*(m-1)) muls
327         if (x > T_crit) {
328           const double one_over_x = 1/x;
329           Fm[0] = 0.88622692545275801365 * sqrt(one_over_x); // see Eq. (9.8.9) in Helgaker-Jorgensen-Olsen
330           if (m_max == 0)
331             return;
332           // this upward recursion formula omits - e^(-x)/(2x), which for x>T_crit is small enough to guarantee full double precision
333           for (int i = 1; i <= m_max; i++)
334             Fm[i] = Fm[i - 1] * numbers_.ihalf[i] * one_over_x; // see Eq. (9.8.13)
335           return;
336         }
337 
338         // ---------------------------------------------
339         // small and intermediate arguments => interpolate Fm and (optional) downward recursion
340         // ---------------------------------------------
341         // which interval does this x fall into?
342         const Real x_over_delta = x * one_over_delta;
343         const int iv = int(x_over_delta); // the interval index
344         const Real xd = x_over_delta - (Real)iv - 0.5; // this ranges from -0.5 to 0.5
345         const int m_min = 0;
346 
347 #if defined(__AVX__)
348         const auto x2 = xd*xd;
349         const auto x3 = x2*xd;
350         const auto x4 = x2*x2;
351         const auto x5 = x2*x3;
352         const auto x6 = x3*x3;
353         const auto x7 = x3*x4;
354         libint2::simd::VectorAVXDouble x0vec(1., xd, x2, x3);
355         libint2::simd::VectorAVXDouble x1vec(x4, x5, x6, x7);
356 #endif // AVX
357 
358         const Real *d = c + (ORDERp1) * (iv * (mmax+1) + m_min); // ptr to the interpolation data for m=mmin
359         int m = m_min;
360 #if defined(__AVX__)
361         if (m_max-m >=3) {
362           const int unroll_size = 4;
363           const int m_fence = (m_max + 2 - unroll_size);
364           for(; m<m_fence; m+=unroll_size, d+=ORDERp1*unroll_size) {
365             libint2::simd::VectorAVXDouble d00v, d01v, d10v, d11v,
366                                            d20v, d21v, d30v, d31v;
367             d00v.load_aligned(d);            d01v.load_aligned((d+4));
368             d10v.load_aligned(d+ORDERp1);    d11v.load_aligned((d+4)+ORDERp1);
369             d20v.load_aligned(d+2*ORDERp1);  d21v.load_aligned((d+4)+2*ORDERp1);
370             d30v.load_aligned(d+3*ORDERp1);  d31v.load_aligned((d+4)+3*ORDERp1);
371             libint2::simd::VectorAVXDouble fm0 = d00v * x0vec + d01v * x1vec;
372             libint2::simd::VectorAVXDouble fm1 = d10v * x0vec + d11v * x1vec;
373             libint2::simd::VectorAVXDouble fm2 = d20v * x0vec + d21v * x1vec;
374             libint2::simd::VectorAVXDouble fm3 = d30v * x0vec + d31v * x1vec;
375             libint2::simd::VectorAVXDouble sum0123 = horizontal_add(fm0, fm1, fm2, fm3);
376             sum0123.convert(&Fm[m]);
377           }
378         } // unroll_size=4
379         if (m_max-m >=1) {
380           const int unroll_size = 2;
381           const int m_fence = (m_max + 2 - unroll_size);
382           for(; m<m_fence; m+=unroll_size, d+=ORDERp1*unroll_size) {
383             libint2::simd::VectorAVXDouble d00v, d01v, d10v, d11v;
384             d00v.load_aligned(d);
385             d01v.load_aligned((d+4));
386             d10v.load_aligned(d+ORDERp1);
387             d11v.load_aligned((d+4)+ORDERp1);
388             libint2::simd::VectorAVXDouble fm0 = d00v * x0vec + d01v * x1vec;
389             libint2::simd::VectorAVXDouble fm1 = d10v * x0vec + d11v * x1vec;
390             libint2::simd::VectorSSEDouble sum01 = horizontal_add(fm0, fm1);
391             sum01.convert(&Fm[m]);
392           }
393         } // unroll_size=2
394         { // no unrolling
395           for(; m<=m_max; ++m, d+=ORDERp1) {
396             libint2::simd::VectorAVXDouble d0v, d1v;
397             d0v.load_aligned(d);
398             d1v.load_aligned(d+4);
399             Fm[m] = horizontal_add(d0v * x0vec + d1v * x1vec);
400           }
401         }
402 #else // AVX not available
403         for(m=m_min; m<=m_max; ++m, d+=ORDERp1) {
404           Fm[m] = d[0]
405                 + xd * (d[1]
406                 + xd * (d[2]
407                 + xd * (d[3]
408                 + xd * (d[4]
409                 + xd * (d[5]
410                 + xd * (d[6]
411                 + xd * (d[7])))))));
412 
413           //        // check against the reference value
414           //        if (false) {
415           //          double refvalue = FmEval_Reference2<double>::eval(x, m); // compute F(T)
416           //          if (abs(refvalue - Fm[m]) > 1e-10) {
417           //            std::cout << "T = " << x << " m = " << m << " cheb = "
418           //                      << Fm[m] << " ref = " << refvalue << std::endl;
419           //          }
420           //        }
421         }
422 #endif
423 
424 
425       } // eval()
426 
427     private:
428 
init_table()429       void init_table() {
430 
431         // get memory
432         void* result;
433         int status = posix_memalign(&result, ORDERp1*sizeof(Real), (mmax + 1) * cheb_table_nintervals * ORDERp1 * sizeof(Real));
434         if (status != 0) {
435           if (status == EINVAL)
436             throw std::logic_error(
437               "FmEval_Chebyshev7::init() : posix_memalign failed, alignment must be a power of 2 at least as large as sizeof(void *)");
438           if (status == ENOMEM)
439             throw std::bad_alloc();
440           abort();  // should be unreachable
441         }
442         c = static_cast<Real*>(result);
443 
444         // copy contents of static table into c
445         // need all intervals
446         for (std::size_t iv = 0; iv < cheb_table_nintervals; ++iv) {
447           // but only values of m up to mmax
448           std::copy(cheb_table[iv], cheb_table[iv]+(mmax+1)*ORDERp1, c+(iv*(mmax+1))*ORDERp1);
449         }
450       }
451 
452   }; // FmEval_Chebyshev7
453 
454 #if LIBINT2_CONSTEXPR_STATICS
455   template <typename Real>
456   constexpr
457   double FmEval_Chebyshev7<Real>::cheb_table[FmEval_Chebyshev7<Real>::cheb_table_nintervals][(FmEval_Chebyshev7<Real>::cheb_table_mmax+1)*(FmEval_Chebyshev7<Real>::interpolation_order+1)];
458 #else
459   // clang needs an explicit specifalization declaration to avoid warning
460 #  ifdef __clang__
461   template <typename Real>
462   double FmEval_Chebyshev7<Real>::cheb_table[FmEval_Chebyshev7<Real>::cheb_table_nintervals][(FmEval_Chebyshev7<Real>::cheb_table_mmax+1)*(FmEval_Chebyshev7<Real>::interpolation_order+1)];
463 #  endif
464 #endif
465 
466 #ifndef STATIC_OON
467 #define STATIC_OON
468   namespace {
469     const double oon[] = {0.0, 1.0, 1.0/2.0, 1.0/3.0, 1.0/4.0, 1.0/5.0, 1.0/6.0, 1.0/7.0, 1.0/8.0, 1.0/9.0, 1.0/10.0, 1.0/11.0};
470   }
471 #endif
472 
473   /** Computes the Boys function, \$ F_m (T) = \int_0^1 u^{2m} \exp(-T u^2) \, {\rm d}u \$,
474     * using Taylor interpolation of up to 8-th order.
475     * @tparam Real the type to use for all floating-point computations. Following expressions must be valid: exp(Real), pow(Real), fabs(Real), max(Real), and floor(Real).
476     * @tparam INTERPOLATION_ORDER the interpolation order. The higher the order the less memory this object will need, but the computational cost will increase (usually very slightly)
477     */
478   template<typename Real = double, int INTERPOLATION_ORDER = 7>
479   class FmEval_Taylor {
480     public:
481       static_assert(std::is_same<Real,double>::value, "FmEval_Taylor only supports double as the real type");
482 
483       static const int max_interp_order = 8;
484       static const bool INTERPOLATION_AND_RECURSION = false; // compute F_lmax(T) and then iterate down to F_0(T)? Else use interpolation only
485       const double soft_zero_;
486 
487       /// Constructs the object to be able to compute Boys funcion for m in [0,mmax], with relative \c precision
488       FmEval_Taylor(unsigned int mmax, Real precision = std::numeric_limits<Real>::epsilon()) :
489           soft_zero_(1e-6), cutoff_(precision), numbers_(
490               INTERPOLATION_ORDER + 1, 2 * (mmax + INTERPOLATION_ORDER - 1)) {
491 
492 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)
493 # if !defined(__AVX__) && defined(NDEBUG)
494         if (libint2::verbose()) {
495           static bool printed_performance_warning = false;
496           if (!printed_performance_warning) {
497             libint2::verbose_stream()
498                 << "libint2::FmEval_Taylor on x86(-64) platforms needs AVX support for best performance"
499                 << std::endl;
500             printed_performance_warning = true;
501           }
502         }
503 # endif
504 #endif
505 
506         assert(mmax <= 63);
507 
508         const double sqrt_pi = std::sqrt(M_PI);
509 
510         /*---------------------------------------
511          We are doing Taylor interpolation with
512          n=TAYLOR_ORDER terms here:
513          error <= delT^n/(n+1)!
514          ---------------------------------------*/
515         delT_ = 2.0
516             * std::pow(cutoff_ * numbers_.fac[INTERPOLATION_ORDER + 1],
517                        1.0 / INTERPOLATION_ORDER);
518         oodelT_ = 1.0 / delT_;
519         max_m_ = mmax + INTERPOLATION_ORDER - 1;
520 
521         T_crit_ = new Real[max_m_ + 1]; /*--- m=0 is included! ---*/
522         max_T_ = 0;
523         /*--- Figure out T_crit for each m and put into the T_crit ---*/
524         for (int m = max_m_; m >= 0; --m) {
525           /*------------------------------------------
526            Damped Newton-Raphson method to solve
527            T^{m-0.5}*exp(-T) = epsilon*Gamma(m+0.5)
528            The solution is the max T for which to do
529            the interpolation
530            ------------------------------------------*/
531           double T = -log(cutoff_);
532           const double egamma = cutoff_ * sqrt_pi * numbers_.df[2 * m]
533               / std::pow(2.0, m);
534           double T_new = T;
535           double func;
536           do {
537             const double damping_factor = 0.2;
538             T = T_new;
539             /* f(T) = the difference between LHS and RHS of the equation above */
540             func = std::pow(T, m - 0.5) * std::exp(-T) - egamma;
541             const double dfuncdT = ((m - 0.5) * std::pow(T, m - 1.5)
542                 - std::pow(T, m - 0.5)) * std::exp(-T);
543             /* f(T) has 2 roots and has a maximum in between. If f'(T) > 0 we are to the left of the hump. Make a big step to the right. */
544             if (dfuncdT > 0.0) {
545               T_new *= 2.0;
546             } else {
547               /* damp the step */
548               double deltaT = -func / dfuncdT;
549               const double sign_deltaT = (deltaT > 0.0) ? 1.0 : -1.0;
550               const double max_deltaT = damping_factor * T;
551               if (std::fabs(deltaT) > max_deltaT)
552                 deltaT = sign_deltaT * max_deltaT;
553               T_new = T + deltaT;
554             }
555             if (T_new <= 0.0) {
556               T_new = T / 2.0;
557             }
558           } while (std::fabs(func / egamma) >= soft_zero_);
559           T_crit_[m] = T_new;
560           const int T_idx = (int) std::floor(T_new / delT_);
561           max_T_ = std::max(max_T_, T_idx);
562         }
563 
564         // allocate the grid (see the comments below)
565         {
566           const int nrow = max_T_ + 1;
567           const int ncol = max_m_ + 1;
568           grid_ = new Real*[nrow];
569           grid_[0] = new Real[nrow * ncol];
570           //std::cout << "Allocated interpolation table of " << nrow * ncol << " reals" << std::endl;
571           for (int r = 1; r < nrow; ++r)
572             grid_[r] = grid_[r - 1] + ncol;
573         }
574 
575         /*-------------------------------------------------------
576          Tabulate the gamma function from t=delT to T_crit[m]:
577          1) include T=0 though the table is empty for T=0 since
578          Fm(0) is simple to compute
579          -------------------------------------------------------*/
580         /*--- do the mmax first ---*/
581         for (int T_idx = max_T_; T_idx >= 0; --T_idx) {
582           const double T = T_idx * delT_;
583           libint2::FmEval_Reference2<double>::eval(grid_[T_idx], T, max_m_);
584         }
585       }
586 
~FmEval_Taylor()587       ~FmEval_Taylor() {
588         delete[] T_crit_;
589         delete[] grid_[0];
590         delete[] grid_;
591       }
592 
593       /// Singleton interface allows to manage the lone instance;
594       /// adjusts max m and precision values as needed in thread-safe fashion
595       static std::shared_ptr<const FmEval_Taylor> instance(unsigned int mmax, Real precision = std::numeric_limits<Real>::epsilon()) {
596         assert(mmax >= 0);
597         assert(precision >= 0);
598         // thread-safe per C++11 standard [6.7.4]
599         static auto instance_ = std::make_shared<const FmEval_Taylor>(mmax, precision);
600 
601         while (instance_->max_m() < mmax || instance_->precision() > precision) {
602           static std::mutex mtx;
603           std::lock_guard<std::mutex> lck(mtx);
604           if (instance_->max_m() < mmax || instance_->precision() > precision) {
605             auto new_instance = std::make_shared<const FmEval_Taylor>(mmax, precision);
606             instance_ = new_instance;
607           }
608         }
609 
610         return instance_;
611       }
612 
613       /// @return the maximum value of m for which this object can compute the Boys function
max_m()614       int max_m() const { return max_m_ - INTERPOLATION_ORDER + 1; }
615       /// @return the precision with which this object can compute the Boys function
precision()616       Real precision() const { return cutoff_; }
617 
618       /// computes Boys function values with m index in range [0,mmax]
619       /// @param[out] Fm array to be filled in with the Boys function values, must be at least mmax+1 elements long
620       /// @param[in] x the Boys function argument
621       /// @param[in] mmax the maximum value of m for which Boys function will be computed;
622       ///                  it must be <= the value returned by max_m() (this is not checked)
eval(Real * Fm,Real T,int mmax)623       void eval(Real* Fm, Real T, int mmax) const {
624         const double sqrt_pio2 = 1.2533141373155002512;
625         const double two_T = 2.0 * T;
626 
627         // stop recursion at mmin
628         const int mmin = INTERPOLATION_AND_RECURSION ? mmax : 0;
629         /*-------------------------------------
630          Compute Fm(T) from mmax down to mmin
631          -------------------------------------*/
632         const bool use_upward_recursion = true;
633         if (use_upward_recursion) {
634 //          if (T > 30.0) {
635           if (T > T_crit_[0]) {
636             const double one_over_x = 1.0/T;
637             Fm[0] = 0.88622692545275801365 * sqrt(one_over_x); // see Eq. (9.8.9) in Helgaker-Jorgensen-Olsen
638             if (mmax == 0)
639               return;
640             // this upward recursion formula omits - e^(-x)/(2x), which for x>T_crit is <1e-15
641             for (int i = 1; i <= mmax; i++)
642               Fm[i] = Fm[i - 1] * numbers_.ihalf[i] * one_over_x; // see Eq. (9.8.13)
643             return;
644           }
645         }
646 
647         // since Tcrit grows with mmax, this condition only needs to be determined once
648         if (T > T_crit_[mmax]) {
649           double pow_two_T_to_minusjp05 = std::pow(two_T, -mmax - 0.5);
650           for (int m = mmax; m >= mmin; --m) {
651             /*--- Asymptotic formula ---*/
652             Fm[m] = numbers_.df[2 * m] * sqrt_pio2 * pow_two_T_to_minusjp05;
653             pow_two_T_to_minusjp05 *= two_T;
654           }
655         }
656         else
657         {
658           const int T_ind = (int) (0.5 + T * oodelT_);
659           const Real h = T_ind * delT_ - T;
660           const Real* F_row = grid_[T_ind] + mmin;
661 
662 #if defined (__AVX__)
663           libint2::simd::VectorAVXDouble h0123, h4567;
664           if (INTERPOLATION_ORDER == 3 || INTERPOLATION_ORDER == 7) {
665             const double h2 = h*h*oon[2];
666             const double h3 = h2*h*oon[3];
667             h0123 = libint2::simd::VectorAVXDouble (1.0, h, h2, h3);
668             if (INTERPOLATION_ORDER == 7) {
669               const double h4 = h3*h*oon[4];
670               const double h5 = h4*h*oon[5];
671               const double h6 = h5*h*oon[6];
672               const double h7 = h6*h*oon[7];
673               h4567 = libint2::simd::VectorAVXDouble (h4, h5, h6, h7);
674             }
675           }
676           //          libint2::simd::VectorAVXDouble h0123(1.0);
677           //          libint2::simd::VectorAVXDouble h4567(1.0);
678 #endif
679 
680           int m = mmin;
681           if (mmax-m >=1) {
682             const int unroll_size = 2;
683             const int m_fence = (mmax + 2 - unroll_size);
684             for(; m<m_fence; m+=unroll_size, F_row+=unroll_size) {
685 
686 #if defined(__AVX__)
687               if (INTERPOLATION_ORDER == 3 || INTERPOLATION_ORDER == 7) {
688                  libint2::simd::VectorAVXDouble fr0_0123; fr0_0123.load(F_row);
689                  libint2::simd::VectorAVXDouble fr1_0123; fr1_0123.load(F_row+1);
690                  libint2::simd::VectorSSEDouble fm01 = horizontal_add(fr0_0123*h0123, fr1_0123*h0123);
691                  if (INTERPOLATION_ORDER == 7) {
692                    libint2::simd::VectorAVXDouble fr0_4567; fr0_4567.load(F_row+4);
693                    libint2::simd::VectorAVXDouble fr1_4567; fr1_4567.load(F_row+5);
694                    fm01 += horizontal_add(fr0_4567*h4567, fr1_4567*h4567);
695                  }
696                  fm01.convert(&Fm[m]);
697               }
698               else {
699 #endif
700               Real total0 = 0.0, total1 = 0.0;
701               for(int i=INTERPOLATION_ORDER; i>=1; --i) {
702                 total0 = oon[i]*h*(F_row[i] + total0);
703                 total1 = oon[i]*h*(F_row[i+1] + total1);
704               }
705               Fm[m] = F_row[0] + total0;
706               Fm[m+1] = F_row[1] + total1;
707 #if defined(__AVX__)
708               }
709 #endif
710             }
711           } // unroll_size = 2
712           if (m<=mmax) { // unroll_size = 1
713 #if defined(__AVX__)
714             if (INTERPOLATION_ORDER == 3 || INTERPOLATION_ORDER == 7) {
715               libint2::simd::VectorAVXDouble fr0123; fr0123.load(F_row);
716               if (INTERPOLATION_ORDER == 7) {
717                 libint2::simd::VectorAVXDouble fr4567; fr4567.load(F_row+4);
718 //                libint2::simd::VectorSSEDouble fm = horizontal_add(fr0123*h0123, fr4567*h4567);
719 //                Fm[m] = horizontal_add(fm);
720                 Fm[m] = horizontal_add(fr0123*h0123 + fr4567*h4567);
721               }
722               else { // INTERPOLATION_ORDER == 3
723                 Fm[m] = horizontal_add(fr0123*h0123);
724               }
725             }
726             else {
727 #endif
728             Real total = 0.0;
729             for(int i=INTERPOLATION_ORDER; i>=1; --i) {
730               total = oon[i]*h*(F_row[i] + total);
731             }
732             Fm[m] = F_row[0] + total;
733 #if defined(__AVX__)
734             }
735 #endif
736           } // unroll_size = 1
737 
738           // check against the reference value
739 //          if (false) {
740 //            double refvalue = FmEval_Reference2<double>::eval(T, mmax); // compute F(T) with m=mmax
741 //            if (abs(refvalue - Fm[mmax]) > 1e-14) {
742 //              std::cout << "T = " << T << " m = " << mmax << " cheb = "
743 //                  << Fm[mmax] << " ref = " << refvalue << std::endl;
744 //            }
745 //          }
746 
747         } // if T < T_crit
748 
749         /*------------------------------------
750          And then do downward recursion in j
751          ------------------------------------*/
752         if (INTERPOLATION_AND_RECURSION && mmin > 0) {
753           const Real exp_mT = std::exp(-T);
754           for (int m = mmin - 1; m >= 0; --m) {
755             Fm[m] = (exp_mT + two_T * Fm[m+1]) * numbers_.twoi1[m];
756           }
757         }
758       }
759 
760     private:
761       Real **grid_; /* Table of "exact" Fm(T) values. Row index corresponds to
762        values of T (max_T+1 rows), column index to values
763        of m (max_m+1 columns) */
764       Real delT_; /* The step size for T, depends on cutoff */
765       Real oodelT_; /* 1.0 / delT_, see above */
766       Real cutoff_; /* Tolerance cutoff used in all computations of Fm(T) */
767       int max_m_; /* Maximum value of m in the table, depends on cutoff
768        and the number of terms in Taylor interpolation */
769       int max_T_; /* Maximum index of T in the table, depends on cutoff
770        and m */
771       Real *T_crit_; /* Maximum T for each row, depends on cutoff;
772        for a given m and T_idx <= max_T_idx[m] use Taylor interpolation,
773        for a given m and T_idx > max_T_idx[m] use the asymptotic formula */
774 
775       ExpensiveNumbers<double> numbers_;
776 
777       /**
778        * Power series estimate of the error introduced by replacing
779        * \f$ F_m(T) = \int_0^1 \exp(-T t^2) t^{2 m} \, \mathrm{d} t \f$ with analytically
780        * integrable \f$ \int_0^\infty \exp(-T t^2) t^{2 m} \, \mathrm{d} t = \frac{(2m-1)!!}{2^{m+1}} \sqrt{\frac{\pi}{T^{2m+1}}} \f$
781        * @param m
782        * @param T
783        * @return the error estimate
784        */
truncation_error(unsigned int m,double T)785       static double truncation_error(unsigned int m, double T) {
786         const double m2= m * m;
787         const double m3= m2 * m;
788         const double m4= m2 * m2;
789         const double T2= T * T;
790         const double T3= T2 * T;
791         const double T4= T2 * T2;
792         const double T5= T2 * T3;
793 
794         const double result = exp(-T) * (105 + 16*m4 + 16*m3*(T - 8) - 30*T + 12*T2
795             - 8*T3 + 16*T4 + 8*m2*(43 - 9*T + 2*T2) +
796             4*m*(-88 + 23*T - 8*T2 + 4*T3))/(32*T5);
797         return result;
798       }
799       /**
800        * Leading-order estimate of the error introduced by replacing
801        * \f$ F_m(T) = \int_0^1 \exp(-T t^2) t^{2 m} \, \mathrm{d} t \f$ with analytically
802        * integrable \f$ \int_0^\infty \exp(-T t^2) t^{2 m} \, \mathrm{d} t = \frac{(2m-1)!!}{2^{m+1}} \sqrt{\frac{\pi}{T^{2m+1}}} \f$
803        * @param m
804        * @param T
805        * @return the error estimate
806        */
truncation_error(double T)807       static double truncation_error(double T) {
808         const double result = exp(-T) /(2*T);
809         return result;
810       }
811   };
812 
813 
814   namespace detail {
pow10(long exp)815   constexpr double pow10(long exp) {
816     return (exp == 0) ? 1. : ((exp > 0) ? 10. * pow10(exp-1) : 0.1 * pow10(exp+1));
817   }
818   }
819 
820   //////////////////////////////////////////////////////////
821   /// core integral for Yukawa and exponential interactions
822   //////////////////////////////////////////////////////////
823 
824   /**
825    * Evaluates core integral for Gaussian integrals over the Yukawa potential \f$ \exp(- \zeta r) / r \f$ and
826    * the exponential interaction \f$ \exp(- \zeta r) \f$
827    * @tparam Real real type
828    * @warning only @p Real = double is supported
829    * @warning guarantees absolute precision of only about 1e-14
830    */
831   template<typename Real = double>
832   struct TennoGmEval {
833 
834   private:
835 
836     #include <libint2/tenno_cheb.h>
837 
838       static_assert(std::is_same<Real,double>::value, "TennoGmEval only supports double as the real type");
839 
840       static const int mmin_ = -1;
841       static constexpr Real Tmax = (1 << cheb_table_tmaxlog2); //!< critical value of T above which use upward recursion
842       static constexpr Real Umax = detail::pow10(cheb_table_umaxlog10); //!< max value of U for which to interpolate
843       static constexpr Real Umin = detail::pow10(cheb_table_uminlog10); //!< min value of U for which to interpolate
844       static constexpr std::size_t ORDERp1 = interpolation_order + 1;
845       static constexpr Real maxabsprecision = 1.4e-14;  //!< guaranteed abs precision of the interpolation table for m>0
846 
847   public:
848       /// \param m_max maximum value of the Gm function index
849       /// \param precision the desired *absolute* precision (relative precision for most intervals will be below epsilon, but for large T/U values and high m relative precision is low
850       /// \throw std::invalid_argument if \c m_max is greater than \c cheb_table_mmax (see tenno_cheb.h)
851       /// \throw std::invalid_argument if \c precision is smaller than \c maxabsprecision
852       TennoGmEval(unsigned int mmax, Real precision = -1) :
mmax_TennoGmEval853         mmax_(mmax), precision_(precision),
854         numbers_() {
855 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)
856 # if !defined(__AVX__) && defined(NDEBUG)
857         if (libint2::verbose()) {
858           static bool printed_performance_warning = false;
859           if (!printed_performance_warning) {
860             libint2::verbose_stream()
861                 << "libint2::TennoGmEval on x86(-64) platforms needs AVX support for best performance"
862                 << std::endl;
863             printed_performance_warning = true;
864           }
865         }
866 # endif
867 #endif
868 
869 //        if (precision_ < maxabsprecision)
870 //          throw std::invalid_argument(
871 //              std::string(
872 //                  "TennoGmEval does not support precision smaller than ") +
873 //              std::to_string(maxabsprecision));
874 
875         if (mmax > cheb_table_mmax)
876           throw std::invalid_argument(
877               "TennoGmEval::init() : requested mmax exceeds the "
878               "hard-coded mmax");
879         init_table();
880       }
881 
~TennoGmEvalTennoGmEval882       ~TennoGmEval() {
883         if (c_ != nullptr)
884           free(c_);
885       }
886 
887       /// Singleton interface allows to manage the lone instance; adjusts max m values as needed in thread-safe fashion
888       static std::shared_ptr<const TennoGmEval> instance(int m_max, double = 0) {
889 
890         assert(m_max >= 0);
891         // thread-safe per C++11 standard [6.7.4]
892         static auto instance_ = std::make_shared<const TennoGmEval>(m_max);
893 
894         while (instance_->max_m() < m_max) {
895           static std::mutex mtx;
896           std::lock_guard<std::mutex> lck(mtx);
897           if (instance_->max_m() < m_max) {
898             auto new_instance = std::make_shared<const TennoGmEval>(m_max);
899             instance_ = new_instance;
900           }
901         }
902 
903         return instance_;
904       }
905 
max_mTennoGmEval906       unsigned int max_m() const { return mmax_; }
907       /// @return the precision with which this object can compute the result
precisionTennoGmEval908       Real precision() const { return precision_; }
909 
910       /// @param[in] Gm pointer to array of @c mmax+1 @c Real elements, on
911       ///               return this contains the core integral for Yukawa
912       ///               interaction, \f$ \exp(-zeta r_{12})/r_{12} \f$ ,
913       ///               namely \f$ G_{m}(T,U) = \int_0^1 t^{2m} \exp(U(1-t^{-2}) - Tt^2) dt, m \in [0,m_\mathrm{max}] \f$, where
914       ///               \f$ T = \rho |\vec{P} - \vec{Q}|^2 \f$ and
915       ///               \f$ U = \zeta^2 / (4 \rho) \f$
916       /// @param[in] one_over_rho \f$ 1/\rho \f$
917       /// @param[in] T \f$ T \f$
918       /// @param[in] mmax \f$ m_\mathrm{max} \f$
919       /// @param[in] zeta \f$ \zeta \f$
eval_yukawaTennoGmEval920       void eval_yukawa(Real* Gm, Real one_over_rho, Real T, size_t mmax, Real zeta) const {
921         assert(mmax <= mmax_);
922         assert(T >= 0);
923         const auto U = 0.25 * zeta * zeta * one_over_rho;
924         assert(U >= 0);
925         if (T > Tmax || U < Umin) {
926           eval_Gm_urr(Gm, T, U, mmax, 0); // no need for G_-1
927         } else {
928           interpolate_Gm<false>(Gm, T, U, 0, mmax);
929         }
930       }
931       /// @param[in] Gm pointer to array of @c mmax+1 @c Real elements, on
932       ///               return this contains the core integral for Slater-type
933       ///               geminal, \f$ \exp(-zeta r_{12}) \f$ ,
934       ///               namely \f$ \sqrt{U} (G_{m-1}(T,U) - G_m(T,U)), m \in [0,m_\mathrm{max}] \f$ where\
935       //                \f$ G_{m}(T,U) = \int_0^1 t^{2m} \exp(U(1-t^{-2}) - Tt^2) dt \f$, where
936       ///               \f$ T = \rho |\vec{P} - \vec{Q}|^2 \f$ and
937       ///               \f$ U = \zeta^2 / (4 \rho) \f$
938       /// @param[in] one_over_rho \f$ 1/\rho \f$
939       /// @param[in] T \f$ T \f$
940       /// @param[in] mmax \f$ m_\mathrm{max} \f$
941       /// @param[in] zeta \f$ \zeta \f$
eval_slaterTennoGmEval942       void eval_slater(Real* Gm, Real one_over_rho, Real T, size_t mmax, Real zeta) const {
943         assert(mmax <= mmax_);
944         assert(T >= 0);
945         const auto U = 0.25 * zeta * zeta * one_over_rho;
946         assert(U > 0);  // integral factorizes into 2 overlaps for U = 0
947         const auto zeta_over_two_rho = 0.5 * zeta * one_over_rho;
948         if (T > Tmax) {
949           eval_Gm_urr(Gm, T, U, mmax, -1);
950         } else {
951           interpolate_Gm<true>(Gm, T, U, zeta_over_two_rho, mmax);
952         }
953       }
954 
955   private:
956 
957       /// Computes G_0 and (optionally) G_{-1}
958       /// @tparam compute_Gm1, if true, compute G_0 and G_{-1}, otherwise G_0 only
959       /// @param[in] T the value of \f$ T \f$
960       /// @param[in] U the value of \f$ U \f$
961       /// @return if @c compute_Gm1==true return {G_0,G_{-1}}, otherwise {G_0,0}
962       template <bool compute_Gm1>
eval_G0_and_maybe_Gm1TennoGmEval963       static inline std::tuple<Real,Real> eval_G0_and_maybe_Gm1(Real T, Real U) {
964         assert(T >= 0);
965         assert(U >= 0);
966         const Real sqrtPi(1.7724538509055160272981674833411451827975494561224);
967 
968         Real G_m1 = 0;
969         Real G_0 = 0;
970         if (U == 0) {  // \sqrt{U} G_{-1} is finite, need to handle that case separately
971           assert(!compute_Gm1);
972           if (T < std::numeric_limits<Real>::epsilon()) {
973             G_0 = 1;
974           }
975           else {
976             const Real sqrt_T = sqrt(T);
977             const Real sqrtPi_over_2 = sqrtPi * 0.5;
978             G_0 = sqrtPi_over_2 * erf(sqrt_T) / sqrt_T;
979           }
980         }
981         else if (T > std::numeric_limits<Real>::epsilon()) {  // U > 0
982           const Real sqrt_U = sqrt(U);
983           const Real sqrt_T = sqrt(T);
984           const Real oo_sqrt_T = 1 / sqrt_T;
985           const Real oo_sqrt_U = compute_Gm1 ? (1 / sqrt_U) : 0;
986           const Real kappa = sqrt_U - sqrt_T;
987           const Real lambda = sqrt_U + sqrt_T;
988           const Real sqrtPi_over_4 = sqrtPi * 0.25;
989           const Real pfac = sqrtPi_over_4;
990           const Real erfc_k = exp(kappa * kappa - T) * erfc(kappa);
991           const Real erfc_l = exp(lambda * lambda - T) * erfc(lambda);
992 
993           G_m1 = compute_Gm1 ? pfac * (erfc_k + erfc_l) * oo_sqrt_U : 0.0;
994           G_0 = pfac * (erfc_k - erfc_l) * oo_sqrt_T;
995         }
996         else {  // T = 0, U > 0
997           const Real exp_U = exp(U);
998           const Real sqrt_U = sqrt(U);
999           if (compute_Gm1) {
1000             const Real sqrtPi_over_2 = sqrtPi * 0.5;
1001             const Real oo_sqrt_U = compute_Gm1 ? (1 / sqrt_U) : 0;
1002 
1003             G_m1 = exp_U * sqrtPi_over_2 * erfc(sqrt_U) * oo_sqrt_U;
1004           }
1005           G_0 = 1 - exp_U * sqrtPi * erfc(sqrt_U) * sqrt_U;
1006         }
1007 
1008         return std::make_tuple(G_0, G_m1);
1009       }
1010 
1011       /// Computes core integrals by upward recursion from \f$ G_{-1} (T,U) \f$ and \f$ G_0 (T,U) \f$
1012       /// this is unstable for small T, so use when interpolation does not apply.
1013       /// @param[out] Gm \f$ G_m(T,U), m=mmin..mmax \f$ , i.e. @c Gm[m] \f$ = G_{\mathrm{mmin} + m}(T,U) \f$
1014       /// @param[in] T the value of \f$ T \f$
1015       /// @param[in] U the value of \f$ U \f$
1016       /// @param[in] mmax the maximum value of \f$ m \f$
1017       /// @param[in] mmin the minimum value of \f$ m \f$ ; the only valid values are 0 and -1
eval_Gm_urrTennoGmEval1018       static void eval_Gm_urr(Real* Gm, Real T, Real U, size_t mmax, long mmin) {
1019         assert(mmin == 0 || mmin == -1);
1020         assert(T > 0);
1021         assert(U > 0);
1022 
1023         const Real sqrt_U = sqrt(U);
1024         const Real sqrt_T = sqrt(T);
1025         const Real oo_sqrt_T = 1 / sqrt_T;
1026         const Real oo_sqrt_U = 1 / sqrt_U;
1027         const Real kappa = sqrt_U - sqrt_T;
1028         const Real lambda = sqrt_U + sqrt_T;
1029         const Real sqrtPi_over_4(0.44311346272637900682454187083528629569938736403060);
1030         const Real pfac = sqrtPi_over_4;
1031         const Real erfc_k = exp(kappa*kappa - T) * erfc(kappa);
1032         const Real erfc_l = exp(lambda*lambda - T) * erfc(lambda);
1033 
1034         Real G_m1_value;
1035         Real& G_m1 = (mmin == -1) ? Gm[0] : G_m1_value;
1036         G_m1 = pfac * (erfc_k + erfc_l) * oo_sqrt_U;
1037         Real& G_0 = (mmin == -1) ? Gm[1] : Gm[0];
1038         G_0 = pfac * (erfc_k - erfc_l) * oo_sqrt_T;
1039 
1040         if (mmax > 0) {
1041 
1042           // first application of URR
1043           const Real oo_two_T = 0.5 / T;
1044           const Real two_U = 2.0 * U;
1045           const Real exp_mT = exp(-T);
1046 
1047           Real* Gmm1 = &G_m1;
1048           Real* Gm0 = &G_0;
1049           Real* Gmp1 = Gm0 + 1;
1050           for(unsigned int m=1, two_m_minus_1=1; m<=mmax; ++m, two_m_minus_1+=2, ++Gmp1) {
1051             *Gmp1 = oo_two_T * ( two_m_minus_1 * *Gm0 + two_U * *Gmm1 - exp_mT);
1052             Gmm1 = Gm0;
1053             Gm0 = Gmp1;
1054           }
1055         }
1056 
1057         return;
1058       }
1059 
1060       /// compute core integrals by Chebyshev interpolation
1061       /// @tparam[in] exp if true, will compute core integrals of the exponential operator, otherwise will compute Yukawa
1062       /// @param[out] Gm_vec if @c exp==false Gm_vec[m]=\f$ G_m(T,U), m=0..mmax \f$, otherwise Gm_vec[m]=\f$ \frac{\zeta}{2 \rho} \left( G_{m-1}(T,U) - G_m(T,U) \right), m=0..mmax \f$ ; must be at least @c mmax+1 elements long
1063       /// @param[in] T the value of \f$ T \f$
1064       /// @param[in] U the value of \f$ U \f$
1065       /// @param[in] zeta_over_two_rho the value of \f$ \frac{\zeta}{2 \rho} \f$, only used if @c exp==true
1066       /// @param[in] mmax the maximum value of \f$ m \f$
1067       template <bool exp>
interpolate_GmTennoGmEval1068       void interpolate_Gm(Real* Gm_vec, Real T, Real U, Real zeta_over_two_rho, long mmax) const {
1069         assert(T >= 0);
1070         assert(U >= Umin && U <= Umax);
1071 
1072         // maps x in [0,2] to [-1/2,1/2] in a linear fashion
1073         auto linear_map_02 = [](Real x) {
1074           assert(x >= 0 && x <= 2);
1075           return (x - 1) * 0.5;
1076         };
1077         // maps x in [a, 2*a] to [-1/2,1/2] in a logarithmic fashion
1078         auto log2_map = [](Real x, Real one_over_a) {
1079           return std::log2(x * one_over_a) - 0.5;
1080         };
1081         // maps x in [a, 10*a] to [-1/2,1/2] in a logarithmic fashion
1082         auto log10_map = [](Real x, Real one_over_a) {
1083           return std::log10(x * one_over_a) - 0.5;
1084         };
1085 
1086         // which interval does this T fall into?
1087         const int Tint = T < 2 ? 0 : int(std::floor(std::log2(T)));  assert(Tint >= 0 && Tint < 10);
1088         // precomputed 1 / 2^K
1089         constexpr Real one_over_2K[] = {1, 0.5, 0.25, 0.125, 0.0625, 0.03125, 0.015625, 0.0078125, 0.00390625, .001953125};
1090         // which interval does this U fall into?
1091         const int Uint = int(std::floor(std::log10(U / Umin))); assert(Uint >= 0 && Uint < 10);
1092         // precomputed 1 / 10^(cheb_table_uminlog10 + K)
1093         constexpr Real one_over_10K[] = {1. / detail::pow10(cheb_table_uminlog10),
1094                                                1. / detail::pow10(cheb_table_uminlog10+1),
1095                                                1. / detail::pow10(cheb_table_uminlog10+2),
1096                                                1. / detail::pow10(cheb_table_uminlog10+3),
1097                                                1. / detail::pow10(cheb_table_uminlog10+4),
1098                                                1. / detail::pow10(cheb_table_uminlog10+5),
1099                                                1. / detail::pow10(cheb_table_uminlog10+6),
1100                                                1. / detail::pow10(cheb_table_uminlog10+7),
1101                                                1. / detail::pow10(cheb_table_uminlog10+8),
1102                                                1. / detail::pow10(cheb_table_uminlog10+9)};
1103         const Real t = Tint == 0 ? linear_map_02(T) : log2_map(T, one_over_2K[Tint]); // this ranges from -0.5 to 0.5
1104         const Real u = log10_map(U, one_over_10K[Uint]); // this ranges from -0.5 to 0.5
1105 
1106         const int interval = Tint * 10 + Uint;
1107 
1108 #if defined(__AVX__)
1109         assert(ORDERp1 == 16);
1110         const auto t2 = t*t;
1111         const auto t3 = t2*t;
1112         const auto t4 = t2*t2;
1113         const auto t5 = t2*t3;
1114         const auto t6 = t3*t3;
1115         const auto t7 = t3*t4;
1116         const auto t8 = t4*t4;
1117         const auto t9 = t4*t5;
1118         const auto t10 = t5*t5;
1119         const auto t11 = t5*t6;
1120         const auto t12 = t6*t6;
1121         const auto t13 = t6*t7;
1122         const auto t14 = t7*t7;
1123         const auto t15 = t7*t8;
1124         const auto u2 = u*u;
1125         const auto u3 = u2*u;
1126         const auto u4 = u2*u2;
1127         const auto u5 = u2*u3;
1128         const auto u6 = u3*u3;
1129         const auto u7 = u3*u4;
1130         const auto u8 = u4*u4;
1131         const auto u9 = u4*u5;
1132         const auto u10 = u5*u5;
1133         const auto u11 = u5*u6;
1134         const auto u12 = u6*u6;
1135         const auto u13 = u6*u7;
1136         const auto u14 = u7*u7;
1137         const auto u15 = u7*u8;
1138         libint2::simd::VectorAVXDouble u0vec(1., u, u2, u3);
1139         libint2::simd::VectorAVXDouble u1vec(u4, u5, u6, u7);
1140         libint2::simd::VectorAVXDouble u2vec(u8, u9, u10, u11);
1141         libint2::simd::VectorAVXDouble u3vec(u12, u13, u14, u15);
1142 #endif // AVX
1143 
1144         Real Gmm1 = 0.0; // will track the previous value, only used for exp==false
1145 
1146         constexpr bool compute_Gmm10 = true;
1147         long mmin_interp;
1148         if (compute_Gmm10) {
1149           // precision of interpolation for m=-1,0 can be insufficient, just evaluate explicitly
1150           Real G0;
1151           std::tie(exp ? G0 : Gm_vec[0], Gmm1) = eval_G0_and_maybe_Gm1<exp>(T, U);
1152           if (exp) {
1153             Gm_vec[0] = (Gmm1 - G0) * zeta_over_two_rho;
1154             Gmm1 = G0;
1155           }
1156           mmin_interp = 1;
1157         }
1158         else
1159           mmin_interp = exp ? -1 : 0;
1160 
1161         // now compute the rest
1162         for(long m=mmin_interp; m<=mmax; ++m){
1163             const Real *c_tuint = c_ + (ORDERp1) * (ORDERp1) * (interval * (mmax_ - mmin_ + 1) + (m - mmin_)); // ptr to the interpolation data for m=mmin
1164 #if defined(__AVX__)
1165             libint2::simd::VectorAVXDouble c00v, c01v, c02v, c03v, c10v, c11v, c12v, c13v,
1166                                            c20v, c21v, c22v, c23v, c30v, c31v, c32v, c33v,
1167                                            c40v, c41v, c42v, c43v, c50v, c51v, c52v, c53v,
1168                                            c60v, c61v, c62v, c63v, c70v, c71v, c72v, c73v;
1169             libint2::simd::VectorAVXDouble c80v, c81v, c82v, c83v, c90v, c91v, c92v, c93v,
1170                                            ca0v, ca1v, ca2v, ca3v, cb0v, cb1v, cb2v, cb3v,
1171                                            cc0v, cc1v, cc2v, cc3v, cd0v, cd1v, cd2v, cd3v,
1172                                            ce0v, ce1v, ce2v, ce3v, cf0v, cf1v, cf2v, cf3v;
1173             c00v.load_aligned(c_tuint);                c01v.load_aligned((c_tuint+4));
1174             c02v.load_aligned(c_tuint+8);              c03v.load_aligned((c_tuint+12));
1175             libint2::simd::VectorAVXDouble t0vec(1, 1, 1, 1);
1176             libint2::simd::VectorAVXDouble t0 = t0vec * (c00v * u0vec + c01v * u1vec + c02v * u2vec + c03v * u3vec);
1177             c10v.load_aligned(c_tuint      +ORDERp1);  c11v.load_aligned((c_tuint+4)   +ORDERp1);
1178             c12v.load_aligned((c_tuint+8)  +ORDERp1);  c13v.load_aligned((c_tuint+12)  +ORDERp1);
1179             libint2::simd::VectorAVXDouble t1vec(t, t, t, t);
1180             libint2::simd::VectorAVXDouble t1 = t1vec * (c10v * u0vec + c11v * u1vec + c12v * u2vec + c13v * u3vec);
1181             c20v.load_aligned(c_tuint    +2*ORDERp1);  c21v.load_aligned((c_tuint+4) +2*ORDERp1);
1182             c22v.load_aligned((c_tuint+8)+2*ORDERp1);  c23v.load_aligned((c_tuint+12)+2*ORDERp1);
1183             libint2::simd::VectorAVXDouble t2vec(t2, t2, t2, t2);
1184             libint2::simd::VectorAVXDouble t2 = t2vec * (c20v * u0vec + c21v * u1vec + c22v * u2vec + c23v * u3vec);
1185             c30v.load_aligned(c_tuint    +3*ORDERp1);  c31v.load_aligned((c_tuint+4) +3*ORDERp1);
1186             c32v.load_aligned((c_tuint+8)+3*ORDERp1);  c33v.load_aligned((c_tuint+12)+3*ORDERp1);
1187             libint2::simd::VectorAVXDouble t3vec(t3, t3, t3, t3);
1188             libint2::simd::VectorAVXDouble t3 = t3vec * (c30v * u0vec + c31v * u1vec + c32v * u2vec + c33v * u3vec);
1189             libint2::simd::VectorAVXDouble t0123 = horizontal_add(t0, t1, t2, t3);
1190 
1191             c40v.load_aligned(c_tuint    +4*ORDERp1);  c41v.load_aligned((c_tuint+4) +4*ORDERp1);
1192             c42v.load_aligned((c_tuint+8)+4*ORDERp1);  c43v.load_aligned((c_tuint+12)+4*ORDERp1);
1193             libint2::simd::VectorAVXDouble t4vec(t4, t4, t4, t4);
1194             libint2::simd::VectorAVXDouble t4 = t4vec * (c40v * u0vec + c41v * u1vec + c42v * u2vec + c43v * u3vec);
1195             c50v.load_aligned(c_tuint    +5*ORDERp1);  c51v.load_aligned((c_tuint+4) +5*ORDERp1);
1196             c52v.load_aligned((c_tuint+8)+5*ORDERp1);  c53v.load_aligned((c_tuint+12)+5*ORDERp1);
1197            libint2::simd::VectorAVXDouble t5vec(t5, t5, t5, t5);
1198             libint2::simd::VectorAVXDouble t5 = t5vec * (c50v * u0vec + c51v * u1vec + c52v * u2vec + c53v * u3vec);
1199             c60v.load_aligned(c_tuint    +6*ORDERp1);  c61v.load_aligned((c_tuint+4) +6*ORDERp1);
1200             c62v.load_aligned((c_tuint+8)+6*ORDERp1);  c63v.load_aligned((c_tuint+12)+6*ORDERp1);
1201             libint2::simd::VectorAVXDouble t6vec(t6, t6, t6, t6);
1202             libint2::simd::VectorAVXDouble t6 = t6vec * (c60v * u0vec + c61v * u1vec + c62v * u2vec + c63v * u3vec);
1203             c70v.load_aligned(c_tuint    +7*ORDERp1);  c71v.load_aligned((c_tuint+4) +7*ORDERp1);
1204             c72v.load_aligned((c_tuint+8)+7*ORDERp1);  c73v.load_aligned((c_tuint+12)+7*ORDERp1);
1205             libint2::simd::VectorAVXDouble t7vec(t7, t7, t7, t7);
1206             libint2::simd::VectorAVXDouble t7 = t7vec * (c70v * u0vec + c71v * u1vec + c72v * u2vec + c73v * u3vec);
1207             libint2::simd::VectorAVXDouble t4567 = horizontal_add(t4, t5, t6, t7);
1208 
1209             c80v.load_aligned(c_tuint    +8*ORDERp1);  c81v.load_aligned((c_tuint+4) +8*ORDERp1);
1210             c82v.load_aligned((c_tuint+8)+8*ORDERp1);  c83v.load_aligned((c_tuint+12)+8*ORDERp1);
1211             libint2::simd::VectorAVXDouble t8vec(t8, t8, t8, t8);
1212             libint2::simd::VectorAVXDouble t8 = t8vec * (c80v * u0vec + c81v * u1vec + c82v * u2vec + c83v * u3vec);
1213             c90v.load_aligned(c_tuint    +9*ORDERp1);  c91v.load_aligned((c_tuint+4) +9*ORDERp1);
1214             c92v.load_aligned((c_tuint+8)+9*ORDERp1);  c93v.load_aligned((c_tuint+12)+9*ORDERp1);
1215             libint2::simd::VectorAVXDouble t9vec(t9, t9, t9, t9);
1216             libint2::simd::VectorAVXDouble t9 = t9vec * (c90v * u0vec + c91v * u1vec + c92v * u2vec + c93v * u3vec);
1217             ca0v.load_aligned(c_tuint    +10*ORDERp1);  ca1v.load_aligned((c_tuint+4) +10*ORDERp1);
1218             ca2v.load_aligned((c_tuint+8)+10*ORDERp1);  ca3v.load_aligned((c_tuint+12)+10*ORDERp1);
1219             libint2::simd::VectorAVXDouble tavec(t10, t10, t10, t10);
1220             libint2::simd::VectorAVXDouble ta = tavec * (ca0v * u0vec + ca1v * u1vec + ca2v * u2vec + ca3v * u3vec);
1221             cb0v.load_aligned(c_tuint    +11*ORDERp1);  cb1v.load_aligned((c_tuint+4) +11*ORDERp1);
1222             cb2v.load_aligned((c_tuint+8)+11*ORDERp1);  cb3v.load_aligned((c_tuint+12)+11*ORDERp1);
1223             libint2::simd::VectorAVXDouble tbvec(t11, t11, t11, t11);
1224             libint2::simd::VectorAVXDouble tb = tbvec * (cb0v * u0vec + cb1v * u1vec + cb2v * u2vec + cb3v * u3vec);
1225             libint2::simd::VectorAVXDouble t89ab = horizontal_add(t8, t9, ta, tb);
1226 
1227             cc0v.load_aligned(c_tuint    +12*ORDERp1);  cc1v.load_aligned((c_tuint+4) +12*ORDERp1);
1228             cc2v.load_aligned((c_tuint+8)+12*ORDERp1);  cc3v.load_aligned((c_tuint+12)+12*ORDERp1);
1229             libint2::simd::VectorAVXDouble tcvec(t12, t12, t12, t12);
1230             libint2::simd::VectorAVXDouble tc = tcvec * (cc0v * u0vec + cc1v * u1vec + cc2v * u2vec + cc3v * u3vec);
1231             cd0v.load_aligned(c_tuint    +13*ORDERp1);  cd1v.load_aligned((c_tuint+4) +13*ORDERp1);
1232             cd2v.load_aligned((c_tuint+8)+13*ORDERp1);  cd3v.load_aligned((c_tuint+12)+13*ORDERp1);
1233             libint2::simd::VectorAVXDouble tdvec(t13, t13, t13, t13);
1234             libint2::simd::VectorAVXDouble td = tdvec * (cd0v * u0vec + cd1v * u1vec + cd2v * u2vec + cd3v * u3vec);
1235             ce0v.load_aligned(c_tuint    +14*ORDERp1);  ce1v.load_aligned((c_tuint+4) +14*ORDERp1);
1236             ce2v.load_aligned((c_tuint+8)+14*ORDERp1);  ce3v.load_aligned((c_tuint+12)+14*ORDERp1);
1237             libint2::simd::VectorAVXDouble tevec(t14, t14, t14, t14);
1238             libint2::simd::VectorAVXDouble te = tevec * (ce0v * u0vec + ce1v * u1vec + ce2v * u2vec + ce3v * u3vec);
1239             cf0v.load_aligned(c_tuint    +15*ORDERp1);  cf1v.load_aligned((c_tuint+4)+15*ORDERp1);
1240             cf2v.load_aligned((c_tuint+8)+15*ORDERp1);  cf3v.load_aligned((c_tuint+4)+15*ORDERp1);
1241             libint2::simd::VectorAVXDouble tfvec(t15, t15, t15, t15);
1242             libint2::simd::VectorAVXDouble tf = tfvec * (cf0v * u0vec + cf1v * u1vec + cf2v * u2vec + cf3v * u3vec);
1243             libint2::simd::VectorAVXDouble tcdef = horizontal_add(tc, td, te, tf);
1244 
1245             auto tall = horizontal_add(t0123, t4567, t89ab, tcdef);
1246             const auto Gm = horizontal_add(tall);
1247 #else  // AVX
1248             // no AVX, use explicit loops (probably slow)
1249             double uvec[16];
1250             double tvec[16];
1251             uvec[0] = 1;
1252             tvec[0] = 1;
1253             for(int i=1; i!=16; ++i) {
1254               uvec[i] = uvec[i-1] * u;
1255               tvec[i] = tvec[i-1] * t;
1256             }
1257             double Gm = 0.0;
1258             for(int i=0, ij=0; i!=16; ++i) {
1259               for (int j = 0; j != 16; ++j, ++ij) {
1260                 Gm += c_tuint[ij] * tvec[i] * uvec[j];
1261               }
1262             }
1263 #endif // AVX
1264 
1265             if (!exp) {
1266               Gm_vec[m] = Gm;
1267             }
1268             else {
1269               if (m != -1) {
1270                 Gm_vec[m] = (Gmm1 - Gm) * zeta_over_two_rho;
1271               }
1272               Gmm1 = Gm;
1273             }
1274         }
1275 
1276         return;
1277       }
1278 
1279     private:
1280       unsigned int mmax_;
1281       Real precision_;
1282       ExpensiveNumbers<Real> numbers_;
1283       Real* c_ = nullptr;
1284 
init_tableTennoGmEval1285       void init_table() {
1286 
1287         // get memory
1288         void* result;
1289         int status = posix_memalign(&result, std::max(sizeof(Real), 32ul), (mmax_ - mmin_ + 1) * cheb_table_nintervals * ORDERp1 * ORDERp1 * sizeof(Real));
1290         if (status != 0) {
1291           if (status == EINVAL)
1292             throw std::logic_error(
1293               "TennoGmEval::init() : posix_memalign failed, alignment must be a power of 2 at least as large as sizeof(void *)");
1294           if (status == ENOMEM)
1295             throw std::bad_alloc();
1296           abort();  // should be unreachable
1297         }
1298         c_ = static_cast<Real*>(result);
1299 
1300         // copy contents of static table into c
1301         // need all intervals
1302         for (std::size_t iv = 0; iv < cheb_table_nintervals; ++iv) {
1303           // but only values of m up to mmax
1304           std::copy(cheb_table[iv], cheb_table[iv]+((mmax_-mmin_)+1)*ORDERp1*ORDERp1, c_+(iv*((mmax_-mmin_)+1))*ORDERp1*ORDERp1);
1305         }
1306       }
1307 
1308 
1309   };  // TennoGmEval
1310 
1311 #if LIBINT2_CONSTEXPR_STATICS
1312   template <typename Real>
1313   constexpr
1314   double TennoGmEval<Real>::cheb_table[TennoGmEval<Real>::cheb_table_nintervals][(TennoGmEval<Real>::cheb_table_mmax+2)*(TennoGmEval<Real>::interpolation_order+1)*(TennoGmEval<Real>::interpolation_order+1)];
1315 #else
1316   // clang needs an explicit specifalization declaration to avoid warning
1317 #  ifdef __clang__
1318   template <typename Real>
1319   double TennoGmEval<Real>::cheb_table[TennoGmEval<Real>::cheb_table_nintervals][(TennoGmEval<Real>::cheb_table_mmax+2)*(TennoGmEval<Real>::interpolation_order+1)*(TennoGmEval<Real>::interpolation_order+1)];
1320 #  endif
1321 #endif
1322 
1323   template<typename Real, int k>
1324     struct GaussianGmEval;
1325 
1326   namespace detail {
1327 
1328     /// some evaluators need thread-local scratch, but most don't
1329     template <typename CoreEval> struct CoreEvalScratch {
1330       CoreEvalScratch(const CoreEvalScratch&) = default;
1331       CoreEvalScratch(CoreEvalScratch&&) = default;
CoreEvalScratchCoreEvalScratch1332       explicit CoreEvalScratch(int) { }
1333     };
1334     /// GaussianGmEval<Real,-1> needs extra scratch data
1335     template <typename Real>
1336     struct CoreEvalScratch<GaussianGmEval<Real, -1>> {
1337       std::vector<Real> Fm_;
1338       std::vector<Real> g_i;
1339       std::vector<Real> r_i;
1340       std::vector<Real> oorhog_i;
1341       CoreEvalScratch(const CoreEvalScratch&) = default;
1342       CoreEvalScratch(CoreEvalScratch&&) = default;
1343       explicit CoreEvalScratch(int mmax) {
1344         init(mmax);
1345       }
1346       private:
1347       void init(int mmax) {
1348         Fm_.resize(mmax+1);
1349         g_i.resize(mmax+1);
1350         r_i.resize(mmax+1);
1351         oorhog_i.resize(mmax+1);
1352         g_i[0] = 1.0;
1353         r_i[0] = 1.0;
1354       }
1355     };
1356   } // namespace libint2::detail
1357 
1358   //////////////////////////////////////////////////////////
1359   /// core integrals r12^k \sum_i \exp(- a_i r_12^2)
1360   //////////////////////////////////////////////////////////
1361 
1362   /**
1363    * Evaluates core integral \$ G_m(\rho, T) = \left( - \frac{\partial}{\partial T} \right)^n G_0(\rho,T) \f$,
1364    * \f$ G_0(\rho,T) = \int \exp(-\rho |\vec{r} - \vec{P} + \vec{Q}|^2) g(r) \, {\rm d}\vec{r} \f$
1365    * over a general contracted
1366    * Gaussian geminal \f$ g(r_{12}) = r_{12}^k \sum_i c_i \exp(- a_i r_{12}^2), \quad k = -1, 0, 2 \f$ .
1367    * The integrals are needed in R12/F12 methods with STG-nG correlation factors.
1368    * Specifically, for a correlation factor \f$ f(r_{12}) = \sum_i c_i \exp(- a_i r_{12}^2) \f$
1369    * integrals with the following kernels are needed:
1370    * <ul>
1371    *   <li> \f$ f(r_{12}) \f$  (k=0) </li>
1372    *   <li> \f$ f(r_{12}) / r_{12} \f$  (k=-1) </li>
1373    *   <li> \f$ f(r_{12})^2 \f$ (k=0, @sa GaussianGmEval::eval ) </li>
1374    *   <li> \f$ [f(r_{12}), [\hat{T}_1, f(r_{12})]] \f$ (k=2, @sa GaussianGmEval::eval ) </li>
1375    * </ul>
1376    *
1377    * N.B. ``Asymmetric'' kernels, \f$ f(r_{12}) g(r_{12}) \f$ and
1378    *   \f$ [f(r_{12}), [\hat{T}_1, g(r_{12})]] \f$, where f and g are two different geminals,
1379    *   can also be handled straightforwardly.
1380    *
1381    * \note for more details see DOI: 10.1039/b605188j
1382    */
1383   template<typename Real, int k>
1384   struct GaussianGmEval : private detail::CoreEvalScratch<GaussianGmEval<Real,k>> // N.B. empty-base optimization
1385   {
1386 #ifndef LIBINT_USER_DEFINED_REAL
1387       using FmEvalType = libint2::FmEval_Chebyshev7<double>;
1388 #else
1389       using FmEvalType = libint2::FmEval_Reference<scalar_type>;
1390 #endif
1391 
1392       /**
1393        * @param[in] mmax the evaluator will be used to compute Gm(T) for 0 <= m <= mmax
1394        */
1395       GaussianGmEval(int mmax, Real precision) :
1396           detail::CoreEvalScratch<GaussianGmEval<Real, k>>(mmax), mmax_(mmax),
1397           precision_(precision), fm_eval_(),
1398           numbers_(-1,-1,mmax) {
1399         assert(k == -1 || k == 0 || k == 2);
1400         // for k=-1 need to evaluate the Boys function
1401         if (k == -1) {
1402           fm_eval_ = FmEvalType::instance(mmax_, precision_);
1403         }
1404       }
1405 
1406       ~GaussianGmEval() {
1407       }
1408 
1409       /// Singleton interface allows to manage the lone instance;
1410       /// adjusts max m and precision values as needed in thread-safe fashion
1411       static std::shared_ptr<GaussianGmEval> instance(unsigned int mmax, Real precision = std::numeric_limits<Real>::epsilon()) {
1412         assert(mmax >= 0);
1413         assert(precision >= 0);
1414         // thread-safe per C++11 standard [6.7.4]
1415         static auto instance_ = std::make_shared<GaussianGmEval>(mmax, precision);
1416 
1417         while (instance_->max_m() < mmax || instance_->precision() > precision) {
1418           static std::mutex mtx;
1419           std::lock_guard<std::mutex> lck(mtx);
1420           if (instance_->max_m() < mmax || instance_->precision() > precision) {
1421             auto new_instance = std::make_shared<GaussianGmEval>(mmax, precision);
1422             instance_ = new_instance;
1423           }
1424         }
1425 
1426         return instance_;
1427       }
1428 
1429       /// @return the maximum value of m for which the \f$ G_m(\rho, T) \f$ can be computed with this object
1430       int max_m() const { return mmax_; }
1431       /// @return the precision with which this object can compute the Boys function
1432       Real precision() const { return precision_; }
1433 
1434       /** computes \f$ G_m(\rho, T) \f$ using downward recursion.
1435        *
1436        * @warning NOT reentrant if \c k == -1 and C++11 is not available
1437        *
1438        * @param[out] Gm array to be filled in with the \f$ Gm(\rho, T) \f$ values, must be at least mmax+1 elements long
1439        * @param[in] rho
1440        * @param[in] T
1441        * @param[in] mmax mmax the maximum value of m for which Boys function will be computed;
1442        *                 it must be <= the value returned by max_m() (this is not checked)
1443        * @param[in] geminal the Gaussian geminal for which the core integral \f$ Gm(\rho, T) \f$ is computed; a contracted Gaussian geminal is represented as a vector of pairs, each of which specifies the exponent (first) and contraction coefficient (second) of the primitive Gaussian geminal
1444        * @param[in] scr if \c k ==-1 and need this to be reentrant, must provide ptr to
1445        *                the per-thread \c libint2::detail::CoreEvalScratch<GaussianGmEval<Real,-1>> object;
1446        *                no need to specify \c scr otherwise
1447        */
1448       template <typename AnyReal>
1449       void eval(Real* Gm, Real rho, Real T, size_t mmax,
1450                 const std::vector<std::pair<AnyReal, AnyReal> >& geminal,
1451                 void* scr = 0) {
1452 
1453         std::fill(Gm, Gm+mmax+1, Real(0));
1454 
1455         const auto sqrt_rho = sqrt(rho);
1456         const auto oo_sqrt_rho = 1/sqrt_rho;
1457         if (k == -1) {
1458           void* _scr = (scr == 0) ? this : scr;
1459           auto& scratch = *(reinterpret_cast<detail::CoreEvalScratch<GaussianGmEval<Real, -1>>*>(_scr));
1460           for(int i=1; i<=mmax; i++) {
1461             scratch.r_i[i] = scratch.r_i[i-1] * rho;
1462           }
1463         }
1464 
1465         typedef typename std::vector<std::pair<AnyReal, AnyReal> >::const_iterator citer;
1466         const citer gend = geminal.end();
1467         for(citer i=geminal.begin(); i!= gend; ++i) {
1468 
1469           const auto gamma = i->first;
1470           const auto gcoef = i->second;
1471           const auto rhog = rho + gamma;
1472           const auto oorhog = 1/rhog;
1473 
1474           const auto gorg = gamma * oorhog;
1475           const auto rorg = rho * oorhog;
1476           const auto sqrt_rho_org = sqrt_rho * oorhog;
1477           const auto sqrt_rhog = sqrt(rhog);
1478           const auto sqrt_rorg = sqrt_rho_org * sqrt_rhog;
1479 
1480           /// (ss|g12|ss)
1481           constexpr Real const_SQRTPI_2(0.88622692545275801364908374167057259139877472806119); /* sqrt(pi)/2 */
1482           const auto SS_K0G12_SS = gcoef * oo_sqrt_rho * const_SQRTPI_2 * rorg * sqrt_rorg * exp(-gorg*T);
1483 
1484           if (k == -1) {
1485             void* _scr = (scr == 0) ? this : scr;
1486             auto& scratch = *(reinterpret_cast<detail::CoreEvalScratch<GaussianGmEval<Real, -1>>*>(_scr));
1487 
1488             const auto rorgT = rorg * T;
1489             fm_eval_->eval(&scratch.Fm_[0], rorgT, mmax);
1490 
1491 #if 1
1492             constexpr Real const_2_SQRTPI(1.12837916709551257389615890312154517);   /* 2/sqrt(pi)     */
1493             const auto pfac = const_2_SQRTPI * sqrt_rhog * SS_K0G12_SS;
1494             scratch.oorhog_i[0] = pfac;
1495             for(int i=1; i<=mmax; i++) {
1496               scratch.g_i[i] = scratch.g_i[i-1] * gamma;
1497               scratch.oorhog_i[i] = scratch.oorhog_i[i-1] * oorhog;
1498             }
1499             for(int m=0; m<=mmax; m++) {
1500               Real ssss = 0.0;
1501               Real* bcm = numbers_.bc[m];
1502               for(int n=0; n<=m; n++) {
1503                 ssss += bcm[n] * scratch.r_i[n] * scratch.g_i[m-n] * scratch.Fm_[n];
1504               }
1505               Gm[m] += ssss * scratch.oorhog_i[m];
1506             }
1507 #endif
1508           }
1509 
1510           if (k == 0) {
1511             auto ss_oper_ss_m = SS_K0G12_SS;
1512             Gm[0] += ss_oper_ss_m;
1513             for(int m=1; m<=mmax; ++m) {
1514               ss_oper_ss_m *= gorg;
1515               Gm[m] += ss_oper_ss_m;
1516             }
1517           }
1518 
1519           if (k == 2) {
1520 
1521             /// (ss|g12*r12^2|ss)
1522             const auto rorgT = rorg * T;
1523             const auto SS_K2G12_SS_0 = (1.5 + rorgT) * (SS_K0G12_SS * oorhog);
1524             const auto SS_K2G12_SS_m1 = rorg * (SS_K0G12_SS * oorhog);
1525 
1526             auto SS_K2G12_SS_gorg_m = SS_K2G12_SS_0 ;
1527             auto SS_K2G12_SS_gorg_m1 = SS_K2G12_SS_m1;
1528             Gm[0] += SS_K2G12_SS_gorg_m;
1529             for(int m=1; m<=mmax; ++m) {
1530               SS_K2G12_SS_gorg_m *= gorg;
1531               Gm[m] += SS_K2G12_SS_gorg_m - m * SS_K2G12_SS_gorg_m1;
1532               SS_K2G12_SS_gorg_m1 *= gorg;
1533             }
1534           }
1535 
1536         }
1537 
1538       }
1539 
1540     private:
1541       int mmax_;
1542       Real precision_; //< absolute precision
1543       std::shared_ptr<const FmEvalType> fm_eval_;
1544 
1545       ExpensiveNumbers<Real> numbers_;
1546   };
1547 
1548   template <typename GmEvalFunction>
1549   struct GenericGmEval : private GmEvalFunction {
1550 
1551     typedef typename GmEvalFunction::value_type Real;
1552 
1553       GenericGmEval(int mmax, Real precision) : GmEvalFunction(mmax, precision),
1554           mmax_(mmax), precision_(precision) {}
1555 
1556       static std::shared_ptr<const GenericGmEval> instance(int mmax, Real precision = std::numeric_limits<Real>::epsilon()) {
1557         return std::make_shared<const GenericGmEval>(mmax, precision);
1558       }
1559 
1560       template <typename Real, typename... ExtraArgs>
1561       void eval(Real* Gm, Real rho, Real T, int mmax, ExtraArgs... args) const {
1562         assert(mmax <= mmax_);
1563         (GmEvalFunction(*this))(Gm, rho, T, mmax, std::forward<ExtraArgs>(args)...);
1564       }
1565 
1566       /// @return the maximum value of m for which the \f$ G_m(\rho, T) \f$ can be computed with this object
1567       int max_m() const { return mmax_; }
1568       /// @return the precision with which this object can compute the Boys function
1569       Real precision() const { return precision_; }
1570 
1571     private:
1572       int mmax_;
1573       Real precision_;
1574   };
1575 
1576   // these Gm engines need extra scratch data
1577   namespace os_core_ints {
1578   template <typename Real, int K> struct r12_xx_K_gm_eval;
1579   template <typename Real> struct erfc_coulomb_gm_eval;
1580   }
1581 
1582   namespace detail {
1583   /// r12_xx_K_gm_eval<1> needs extra scratch data
1584   template <typename Real>
1585   struct CoreEvalScratch<os_core_ints::r12_xx_K_gm_eval<Real, 1>> {
1586     std::vector<Real> Fm_;
1587     CoreEvalScratch(const CoreEvalScratch&) = default;
1588     CoreEvalScratch(CoreEvalScratch&&) = default;
1589     // need to store Fm(T) for m = 0 .. mmax+1
1590     explicit CoreEvalScratch(int mmax) { Fm_.resize(mmax+2); }
1591   };
1592   /// erfc_coulomb_gm_eval needs extra scratch data
1593   template <typename Real>
1594   struct CoreEvalScratch<os_core_ints::erfc_coulomb_gm_eval<Real>> {
1595     std::vector<Real> Fm_;
1596     CoreEvalScratch(const CoreEvalScratch&) = default;
1597     CoreEvalScratch(CoreEvalScratch&&) = default;
1598     // need to store Fm(T) for m = 0 .. mmax
1599     explicit CoreEvalScratch(int mmax) { Fm_.resize(mmax+1); }
1600   };
1601   }
1602 
1603   /// Obara-Saika core ints code
1604   namespace os_core_ints {
1605 
1606     /// core integral evaluator delta function kernels
1607   template <typename Real>
1608   struct delta_gm_eval {
1609     typedef Real value_type;
1610 
1611     delta_gm_eval(unsigned int, Real) {}
1612     void operator()(Real* Gm, Real rho, Real T, int mmax) const {
1613       constexpr static auto one_over_two_pi = 1.0 / (2.0 * M_PI);
1614       const auto G0 = exp(-T) * rho * one_over_two_pi;
1615       std::fill(Gm, Gm + mmax + 1, G0);
1616     }
1617   };
1618 
1619   /// core integral evaluator for \f$ r_{12}^K \f$ kernel
1620   /// @tparam K currently supported \c K=1 (use Boys engine directly for \c K=-1)
1621   /// @note need extra scratch for Boys function values when \c K==1,
1622   ///       the Gm vector is not long enough for scratch
1623 
1624   template <typename Real, int K>
1625   struct r12_xx_K_gm_eval;
1626 
1627   template <typename Real>
1628   struct r12_xx_K_gm_eval<Real, 1>
1629       : private detail::CoreEvalScratch<r12_xx_K_gm_eval<Real, 1>> {
1630     typedef detail::CoreEvalScratch<r12_xx_K_gm_eval<Real, 1>> base_type;
1631     typedef Real value_type;
1632 
1633 #ifndef LIBINT_USER_DEFINED_REAL
1634     using FmEvalType = libint2::FmEval_Chebyshev7<double>;
1635 #else
1636     using FmEvalType = libint2::FmEval_Reference<scalar_type>;
1637 #endif
1638 
1639     r12_xx_K_gm_eval(unsigned int mmax, Real precision)
1640         : base_type(mmax) {
1641       fm_eval_ = FmEvalType::instance(mmax + 1, precision);
1642     }
1643     void operator()(Real* Gm, Real rho, Real T, int mmax) {
1644       fm_eval_->eval(&base_type::Fm_[0], T, mmax + 1);
1645       auto T_plus_m_plus_one = T + 1.0;
1646       Gm[0] = T_plus_m_plus_one * base_type::Fm_[0] - T * base_type::Fm_[1];
1647       auto minus_m = -1.0;
1648       T_plus_m_plus_one += 1.0;
1649       for (auto m = 1; m <= mmax;
1650            ++m, minus_m -= 1.0, T_plus_m_plus_one += 1.0) {
1651         Gm[m] =
1652             minus_m * base_type::Fm_[m - 1] + T_plus_m_plus_one * base_type::Fm_[m] - T * base_type::Fm_[m + 1];
1653       }
1654     }
1655 
1656    private:
1657     std::shared_ptr<const FmEvalType> fm_eval_;  // need for odd K
1658   };
1659 
1660   /// core integral evaluator for \f$ \mathrm{erf}(\omega r) / r \f$ kernel
1661   template <typename Real>
1662   struct erf_coulomb_gm_eval {
1663     typedef Real value_type;
1664 
1665 #ifndef LIBINT_USER_DEFINED_REAL
1666     using FmEvalType = libint2::FmEval_Chebyshev7<double>;
1667 #else
1668     using FmEvalType = libint2::FmEval_Reference<scalar_type>;
1669 #endif
1670 
1671     erf_coulomb_gm_eval(unsigned int mmax, Real precision) {
1672       fm_eval_ = FmEvalType::instance(mmax, precision);
1673     }
1674     void operator()(Real* Gm, Real rho, Real T, int mmax, Real omega) const {
1675       if (omega > 0) {
1676         auto omega2 = omega * omega;
1677         auto omega2_over_omega2_plus_rho = omega2 / (omega2 + rho);
1678         fm_eval_->eval(Gm, T * omega2_over_omega2_plus_rho,
1679                        mmax);
1680 
1681         using std::sqrt;
1682         auto ooversqrto2prho_exp_2mplus1 =
1683             sqrt(omega2_over_omega2_plus_rho);
1684         for (auto m = 0; m <= mmax;
1685              ++m, ooversqrto2prho_exp_2mplus1 *= omega2_over_omega2_plus_rho) {
1686           Gm[m] *= ooversqrto2prho_exp_2mplus1;
1687         }
1688       }
1689       else {
1690         std::fill(Gm, Gm+mmax+1, Real{0});
1691       }
1692     }
1693 
1694      private:
1695       std::shared_ptr<const FmEvalType> fm_eval_;  // need for odd K
1696   };
1697 
1698   /// core integral evaluator for \f$ \mathrm{erfc}(\omega r) / r \f$ kernel
1699   /// @note need extra scratch for Boys function values,
1700   ///       since need to call Boys engine twice
1701   template <typename Real>
1702   struct erfc_coulomb_gm_eval : private
1703   detail::CoreEvalScratch<erfc_coulomb_gm_eval<Real>> {
1704     typedef detail::CoreEvalScratch<erfc_coulomb_gm_eval<Real>> base_type;
1705     typedef Real value_type;
1706 
1707     #ifndef LIBINT_USER_DEFINED_REAL
1708     using FmEvalType = libint2::FmEval_Chebyshev7<double>;
1709 #else
1710     using FmEvalType = libint2::FmEval_Reference<scalar_type>;
1711 #endif
1712 
1713     erfc_coulomb_gm_eval(unsigned int mmax, Real precision)
1714         : base_type(mmax) {
1715       fm_eval_ = FmEvalType::instance(mmax, precision);
1716     }
1717     void operator()(Real* Gm, Real rho, Real T, int mmax, Real omega) {
1718       fm_eval_->eval(&base_type::Fm_[0], T, mmax);
1719       std::copy(base_type::Fm_.cbegin(), base_type::Fm_.cbegin() + mmax + 1, Gm);
1720       if (omega > 0) {
1721         auto omega2 = omega * omega;
1722         auto omega2_over_omega2_plus_rho = omega2 / (omega2 + rho);
1723         fm_eval_->eval(&base_type::Fm_[0], T * omega2_over_omega2_plus_rho,
1724                        mmax);
1725 
1726         using std::sqrt;
1727         auto ooversqrto2prho_exp_2mplus1 =
1728             sqrt(omega2_over_omega2_plus_rho);
1729         for (auto m = 0; m <= mmax;
1730              ++m, ooversqrto2prho_exp_2mplus1 *= omega2_over_omega2_plus_rho) {
1731           Gm[m] -= ooversqrto2prho_exp_2mplus1 * base_type::Fm_[m];
1732         }
1733       }
1734     }
1735 
1736      private:
1737       std::shared_ptr<const FmEvalType> fm_eval_;  // need for odd K
1738   };
1739 
1740   }  // namespace os_core_ints
1741 
1742   /*
1743    *  Slater geminal fitting is available only if have LAPACK
1744    */
1745 #if HAVE_LAPACK
1746   /*
1747   f[x_] := - Exp[-\[Zeta] x] / \[Zeta];
1748 
1749   ff[cc_, aa_, x_] := Sum[cc[[i]]*Exp[-aa[[i]] x^2], {i, 1, n}];
1750   */
1751   template <typename Real>
1752   Real
1753   fstg(Real zeta,
1754        Real x) {
1755     return -std::exp(-zeta*x)/zeta;
1756   }
1757 
1758   template <typename Real>
1759   Real
1760   fngtg(const std::vector<Real>& cc,
1761         const std::vector<Real>& aa,
1762         Real x) {
1763     Real value = 0.0;
1764     const Real x2 = x * x;
1765     const unsigned int n = cc.size();
1766     for(unsigned int i=0; i<n; ++i)
1767       value += cc[i] * std::exp(- aa[i] * x2);
1768     return value;
1769   }
1770 
1771   // --- weighting functions ---
1772   // L2 error is weighted by ww(x)
1773   // hence error is weighted by sqrt(ww(x))
1774   template <typename Real>
1775   Real
1776   wwtewklopper(Real x) {
1777     const Real x2 = x * x;
1778     return x2 * std::exp(-2 * x2);
1779   }
1780   template <typename Real>
1781   Real
1782   wwcusp(Real x) {
1783     const Real x2 = x * x;
1784     const Real x6 = x2 * x2 * x2;
1785     return std::exp(-0.005 * x6);
1786   }
1787   // default is Tew-Klopper
1788   template <typename Real>
1789   Real
1790   ww(Real x) {
1791     //return wwtewklopper(x);
1792     return wwcusp(x);
1793   }
1794 
1795   template <typename Real>
1796   Real
1797   norm(const std::vector<Real>& vec) {
1798     Real value = 0.0;
1799     const unsigned int n = vec.size();
1800     for(unsigned int i=0; i<n; ++i)
1801       value += vec[i] * vec[i];
1802     return value;
1803   }
1804 
1805   template <typename Real>
1806   void LinearSolveDamped(const std::vector<Real>& A,
1807                          const std::vector<Real>& b,
1808                          Real lambda,
1809                          std::vector<Real>& x) {
1810     const size_t n = b.size();
1811     std::vector<Real> Acopy(A);
1812     for(size_t m=0; m<n; ++m) Acopy[m*n + m]  *= (1 + lambda);
1813     std::vector<Real> e(b);
1814 
1815     //int info = LAPACKE_dgesv( LAPACK_ROW_MAJOR, n, 1, &Acopy[0], n, &ipiv[0], &e[0], n );
1816     {
1817       std::vector<int> ipiv(n);
1818       int n = b.size();
1819       int one = 1;
1820       int info;
1821       dgesv_(&n, &one, &Acopy[0], &n, &ipiv[0], &e[0], &n, &info);
1822       assert (info == 0);
1823     }
1824 
1825     x = e;
1826   }
1827 
1828   /**
1829    * computes a least-squares fit of \f$ -exp(-\zeta r_{12})/\zeta = \sum_{i=1}^n c_i exp(-a_i r_{12}^2) \f$
1830    * on \f$ r_{12} \in [0, x_{\rm max}] \f$ discretized to npts.
1831    * @param[in] n
1832    * @param[in] zeta
1833    * @param[out] geminal
1834    * @param[in] xmin
1835    * @param[in] xmax
1836    * @param[in] npts
1837    */
1838   template <typename Real>
1839   void stg_ng_fit(unsigned int n,
1840                  Real zeta,
1841                  std::vector< std::pair<Real, Real> >& geminal,
1842                  Real xmin = 0.0,
1843                  Real xmax = 10.0,
1844                  unsigned int npts = 1001) {
1845 
1846     // initial guess
1847     std::vector<Real> cc(n, 1.0); // coefficients
1848     std::vector<Real> aa(n); // exponents
1849     for(unsigned int i=0; i<n; ++i)
1850       aa[i] = std::pow(3.0, (i + 2 - (n + 1)/2.0));
1851 
1852     // first rescale cc for ff[x] to match the norm of f[x]
1853     Real ffnormfac = 0.0;
1854     using std::sqrt;
1855     for(unsigned int i=0; i<n; ++i)
1856       for(unsigned int j=0; j<n; ++j)
1857         ffnormfac += cc[i] * cc[j]/sqrt(aa[i] + aa[j]);
1858     const Real Nf = sqrt(2.0 * zeta) * zeta;
1859     const Real Nff = sqrt(2.0) / (sqrt(ffnormfac) *
1860         sqrt(sqrt(M_PI)));
1861     for(unsigned int i=0; i<n; ++i) cc[i] *= -Nff/Nf;
1862 
1863     Real lambda0 = 1000; // damping factor is initially set to 1000, eventually should end up at 0
1864     const Real nu = 3.0; // increase/decrease the damping factor scale it by this
1865     const Real epsilon = 1e-15; // convergence
1866     const unsigned int maxniter = 200;
1867 
1868     // grid points on which we will fit
1869     std::vector<Real> xi(npts);
1870     for(unsigned int i=0; i<npts; ++i) xi[i] = xmin + (xmax - xmin)*i/(npts - 1);
1871 
1872     std::vector<Real> err(npts);
1873 
1874     const size_t nparams = 2*n; // params = expansion coefficients + gaussian exponents
1875     std::vector<Real> J( npts * nparams );
1876     std::vector<Real> delta(nparams);
1877 
1878 //    std::cout << "iteration 0" << std::endl;
1879 //    for(unsigned int i=0; i<n; ++i)
1880 //      std::cout << cc[i] << " " << aa[i] << std::endl;
1881 
1882     Real errnormI;
1883     Real errnormIm1 = 1e3;
1884     bool converged = false;
1885     unsigned int iter = 0;
1886     while (!converged && iter < maxniter) {
1887 //      std::cout << "Iteration " << ++iter << ": lambda = " << lambda0/nu << std::endl;
1888 
1889         for(unsigned int i=0; i<npts; ++i) {
1890           const Real x = xi[i];
1891           err[i] = (fstg(zeta, x) - fngtg(cc, aa, x)) * sqrt(ww(x));
1892         }
1893         errnormI = norm(err)/sqrt((Real)npts);
1894 
1895 //        std::cout << "|err|=" << errnormI << std::endl;
1896         converged = std::abs((errnormI - errnormIm1)/errnormIm1) <= epsilon;
1897         if (converged) break;
1898         errnormIm1 = errnormI;
1899 
1900         for(unsigned int i=0; i<npts; ++i) {
1901           const Real x2 = xi[i] * xi[i];
1902           const Real sqrt_ww_x = sqrt(ww(xi[i]));
1903           const unsigned int ioffset = i * nparams;
1904           for(unsigned int j=0; j<n; ++j)
1905             J[ioffset+j] = (std::exp(-aa[j] * x2)) * sqrt_ww_x;
1906           const unsigned int ioffsetn = ioffset+n;
1907           for(unsigned int j=0; j<n; ++j)
1908             J[ioffsetn+j] = -  sqrt_ww_x * x2 * cc[j] * std::exp(-aa[j] * x2);
1909         }
1910 
1911         std::vector<Real> A( nparams * nparams);
1912         for(size_t r=0, rc=0; r<nparams; ++r) {
1913           for(size_t c=0; c<nparams; ++c, ++rc) {
1914             double Arc = 0.0;
1915             for(size_t i=0, ir=r, ic=c; i<npts; ++i, ir+=nparams, ic+=nparams)
1916               Arc += J[ir] * J[ic];
1917             A[rc] = Arc;
1918           }
1919         }
1920 
1921         std::vector<Real> b( nparams );
1922         for(size_t r=0; r<nparams; ++r) {
1923           Real br = 0.0;
1924           for(size_t i=0, ir=r; i<npts; ++i, ir+=nparams)
1925             br += J[ir] * err[i];
1926           b[r] = br;
1927         }
1928 
1929         // try decreasing damping first
1930         // if not successful try increasing damping until it results in a decrease in the error
1931         lambda0 /= nu;
1932         for(int l=-1; l<1000; ++l) {
1933 
1934           LinearSolveDamped(A, b, lambda0, delta );
1935 
1936           std::vector<double> cc_0(cc); for(unsigned int i=0; i<n; ++i) cc_0[i] += delta[i];
1937           std::vector<double> aa_0(aa); for(unsigned int i=0; i<n; ++i) aa_0[i] += delta[i+n];
1938 
1939           // if any of the exponents are negative the step is too large and need to increase damping
1940           bool step_too_large = false;
1941           for(unsigned int i=0; i<n; ++i)
1942             if (aa_0[i] < 0.0) {
1943               step_too_large = true;
1944               break;
1945             }
1946           if (!step_too_large) {
1947             std::vector<double> err_0(npts);
1948             for(unsigned int i=0; i<npts; ++i) {
1949               const double x = xi[i];
1950               err_0[i] = (fstg(zeta, x) - fngtg(cc_0, aa_0, x)) * sqrt(ww(x));
1951             }
1952             const double errnorm_0 = norm(err_0)/sqrt((double)npts);
1953             if (errnorm_0 < errnormI) {
1954               cc = cc_0;
1955               aa = aa_0;
1956               break;
1957             }
1958             else // step lead to increase of the error -- try dampening a bit more
1959               lambda0 *= nu;
1960           }
1961           else // too large of a step
1962             lambda0 *= nu;
1963         } // done adjusting the damping factor
1964 
1965       } // end of iterative minimization
1966 
1967       // if reached max # of iterations throw if the error is too terrible
1968       assert(not (iter == maxniter && errnormI > 1e-10));
1969 
1970       for(unsigned int i=0; i<n; ++i)
1971         geminal[i] = std::make_pair(aa[i], cc[i]);
1972     }
1973 #endif
1974 
1975 } // end of namespace libint2
1976 
1977 #endif // C++ only
1978 #endif // header guard
1979