1 /* 2 * Copyright (C) 2004-2021 Edward F. Valeev 3 * 4 * This file is part of Libint. 5 * 6 * Libint is free software: you can redistribute it and/or modify 7 * it under the terms of the GNU Lesser General Public License as published by 8 * the Free Software Foundation, either version 3 of the License, or 9 * (at your option) any later version. 10 * 11 * Libint is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public License 17 * along with Libint. If not, see <http://www.gnu.org/licenses/>. 18 * 19 */ 20 21 #ifndef _libint2_src_lib_libint_boys_h_ 22 #define _libint2_src_lib_libint_boys_h_ 23 24 #if defined(__cplusplus) 25 26 #include <iostream> 27 #include <cstdlib> 28 #include <cmath> 29 #include <stdexcept> 30 #include <libint2/util/vector.h> 31 #include <cassert> 32 #include <vector> 33 #include <algorithm> 34 #include <limits> 35 #include <mutex> 36 #include <type_traits> 37 #include <memory> 38 39 // from now on at least C++11 is required by default 40 #include <libint2/util/cxxstd.h> 41 #if LIBINT2_CPLUSPLUS_STD < 2011 42 # error "Libint2 C++ API requires C++11 support" 43 #endif 44 45 #include <libint2/boys_fwd.h> 46 #include <libint2/numeric.h> 47 #include <libint2/initialize.h> 48 49 #if HAVE_LAPACK // use F77-type interface for now, switch to LAPACKE later 50 extern "C" void dgesv_(const int* n, 51 const int* nrhs, double* A, const int* lda, 52 int* ipiv, double* b, const int* ldb, 53 int* info); 54 #endif 55 56 namespace libint2 { 57 58 /// holds tables of expensive quantities 59 template<typename Real> 60 class ExpensiveNumbers { 61 public: 62 ExpensiveNumbers(int ifac = -1, int idf = -1, int ibc = -1) { 63 if (ifac >= 0) { 64 fac.resize(ifac + 1); 65 fac[0] = 1.0; 66 for (int i = 1; i <= ifac; i++) { 67 fac[i] = i * fac[i - 1]; 68 } 69 } 70 71 if (idf >= 0) { 72 df.resize(idf + 1); 73 /* df[i] gives (i-1)!!, so that (-1)!! is defined... */ 74 df[0] = 1.0; 75 if (idf >= 1) 76 df[1] = 1.0; 77 if (idf >= 2) 78 df[2] = 1.0; 79 for (int i = 3; i <= idf; i++) { 80 df[i] = (i - 1) * df[i - 2]; 81 } 82 } 83 84 if (ibc >= 0) { 85 bc_.resize((ibc+1)*(ibc+1)); 86 std::fill(bc_.begin(), bc_.end(), Real(0)); 87 bc.resize(ibc+1); 88 bc[0] = &bc_[0]; 89 for(int i=1; i<=ibc; ++i) 90 bc[i] = bc[i-1] + (ibc+1); 91 92 for(int i=0; i<=ibc; i++) 93 bc[i][0] = 1.0; 94 for(int i=0; i<=ibc; i++) 95 for(int j=1; j<=i; ++j) 96 bc[i][j] = bc[i][j-1] * Real(i-j+1) / Real(j); 97 } 98 99 for (int i = 0; i < 128; i++) { 100 twoi1[i] = 1.0 / (Real(2.0) * i + Real(1.0)); 101 ihalf[i] = Real(i) - Real(0.5); 102 } 103 104 } 105 ~ExpensiveNumbers()106 ~ExpensiveNumbers() { 107 } 108 109 std::vector<Real> fac; 110 std::vector<Real> df; 111 std::vector<Real*> bc; 112 113 // these quantitites are needed with indices <= mmax 114 // 64 is sufficient to handle up to 4 center integrals with up to L=15 basis functions 115 // but need higher values for Yukawa integrals ... 116 Real twoi1[128]; /* 1/(2 i + 1); needed for downward recursion */ 117 Real ihalf[128]; /* i - 0.5, needed for upward recursion */ 118 119 private: 120 std::vector<Real> bc_; 121 }; 122 123 /** Computes the Boys function, \f$ F_m (T) = \int_0^1 u^{2m} \exp(-T u^2) \, {\rm d}u \f$, 124 * using single algorithm (asymptotic expansion). Slow for the sake of precision control. 125 * Useful in two cases: 126 * <ul> 127 * <li> for reference purposes, if \c Real supports high/arbitrary precision, and </li> 128 * <li> for moderate values of \f$ T \f$, if \c Real is a low-precision floating-point type. 129 * N.B. FmEval_Reference2 , which can compute for all practical values of \f$ T \f$ and \f$ m \f$, is recommended 130 * with standard \c Real types (\c double and \c float). </li> 131 * </ul> 132 * 133 * \note Precision is controlled heuristically, i.e. cannot be guaranteed mathematically; 134 * will stop if absolute precision is reached, or precision of \c Real is exhausted. 135 * It is important that \c std::numeric_limits<Real> is defined appropriately. 136 * 137 * @tparam Real the type to use for all floating-point computations. 138 * Must be able to compute logarithm and exponential, i.e. 139 * log(x) and exp(x), where x is Real, must be valid expressions. 140 */ 141 template<typename Real> 142 struct FmEval_Reference { 143 instanceFmEval_Reference144 static std::shared_ptr<const FmEval_Reference> instance(int /* mmax */, Real /* precision */) { 145 146 // thread-safe per C++11 standard [6.7.4] 147 static auto instance_ = std::make_shared<const FmEval_Reference>(); 148 149 return instance_; 150 } 151 152 /// computes a single value of \f$ F_m(T) \f$ using MacLaurin series to full precision of @c Real evalFmEval_Reference153 static Real eval(Real T, size_t m) { 154 assert(m < 100); 155 static const Real T_crit = std::numeric_limits<Real>::is_bounded ? -log( std::numeric_limits<Real>::min() * 100.5 / 2. ) : Real(0) ; 156 if (std::numeric_limits<Real>::is_bounded && T > T_crit) 157 throw std::overflow_error("FmEval_Reference<Real>::eval: Real lacks precision for the given value of argument T"); 158 static const Real half = Real(1)/2; 159 Real denom = (m + half); 160 using std::exp; 161 Real term = exp(-T) / (2 * denom); 162 Real old_term = 0; 163 Real sum = term; 164 const Real epsilon = get_epsilon(T); 165 const Real epsilon_divided_10 = epsilon / 10; 166 do { 167 denom += 1; 168 old_term = term; 169 term = old_term * T / denom; 170 sum += term; 171 //rel_error = term / sum , hence iterate until rel_error = epsilon 172 // however, must ensure that contributions are decreasing to ensure that omitted contributions are smaller than epsilon 173 } while (term > sum * epsilon_divided_10 || old_term < term); 174 175 return sum; 176 } 177 178 /// fills up an array of Fm(T) for m in [0,mmax] 179 /// @param[out] Fm array to be filled in with the Boys function values, must be at least mmax+1 elements long 180 /// @param[in] T the Boys function argument 181 /// @param[in] mmax the maximum value of m for which Boys function will be computed; evalFmEval_Reference182 static void eval(Real* Fm, Real T, size_t mmax) { 183 184 // evaluate for mmax using MacLaurin series 185 // it converges fastest for the largest m -> use it to compute Fmmax(T) 186 // see JPC 94, 5564 (1990). 187 for(size_t m=0; m<=mmax; ++m) 188 Fm[m] = eval(T, m); 189 return; 190 } 191 192 }; 193 194 /** Computes the Boys function, \$ F_m (T) = \int_0^1 u^{2m} \exp(-T u^2) \, {\rm d}u \$, 195 * using multi-algorithm approach (upward recursion for T>=117, and asymptotic summation for T<117). 196 * This is slow and should be used for reference purposes, e.g. computing the interpolation tables. 197 * Precision is not always guaranteed as it is limited by the precision of \c Real type. 198 * When \c Real is \c double, can maintain absolute precision of epsilon for up to m=40. 199 * 200 * @tparam Real the type to use for all floating-point computations. 201 * Must be able to compute logarithm, exponential, square root, and error function, i.e. 202 * log(x), exp(x), sqrt(x), and erf(x), where x is Real, must be valid expressions. 203 */ 204 template<typename Real> 205 struct FmEval_Reference2 { 206 instanceFmEval_Reference2207 static std::shared_ptr<const FmEval_Reference2> instance(int /* mmax */, Real /* precision */) { 208 209 // thread-safe per C++11 standard [6.7.4] 210 static auto instance_ = std::make_shared<const FmEval_Reference2>(); 211 212 return instance_; 213 } 214 215 /// fills up an array of Fm(T) for m in [0,mmax] 216 /// @param[out] Fm array to be filled in with the Boys function values, must be at least mmax+1 elements long 217 /// @param[in] t the Boys function argument 218 /// @param[in] mmax the maximum value of m for which Boys function will be computed; evalFmEval_Reference2219 static void eval(Real* Fm, Real t, size_t mmax) { 220 221 if (t < Real(117)) { 222 FmEval_Reference<Real>::eval(Fm,t,mmax); 223 } 224 else { 225 const Real two_over_sqrt_pi{1.128379167095512573896158903121545171688101258657997713688171443421284936882986828973487320404214727}; 226 const Real K = 1/two_over_sqrt_pi; 227 228 auto t2 = 2*t; 229 auto et = exp(-t); 230 auto sqrt_t = sqrt(t); 231 Fm[0] = K*erf(sqrt_t)/sqrt_t; 232 if (mmax > 0) 233 for(size_t m=0; m<=mmax-1; m++) { 234 Fm[m+1] = ((2*m + 1)*Fm[m] - et)/(t2); 235 } 236 } 237 } 238 239 }; 240 241 /** Computes the Boys function, \$ F_m (T) = \int_0^1 u^{2m} \exp(-T u^2) \, {\rm d}u \$, 242 * using 7-th order Chebyshev interpolation. 243 */ 244 template <typename Real = double> 245 class FmEval_Chebyshev7 { 246 247 #include <libint2/boys_cheb7.h> 248 249 static_assert(std::is_same<Real,double>::value, "FmEval_Chebyshev7 only supports double as the real type"); 250 251 static constexpr int ORDER = interpolation_order; //!, interpolation order 252 static constexpr int ORDERp1 = ORDER+1; //!< ORDER + 1 253 254 static constexpr Real T_crit = cheb_table_tmax; //!< critical value of T above which safe to use upward recusion 255 static constexpr Real delta = cheb_table_delta; //!< interval size 256 static constexpr Real one_over_delta = 1/delta; //! 1/delta 257 258 int mmax; //!< the maximum m that is tabulated 259 ExpensiveNumbers<double> numbers_; 260 Real *c; /* the Chebyshev coefficients table, T_crit*one_over_delta by mmax*ORDERp1 */ 261 262 public: 263 /// \param m_max maximum value of the Boys function index; set to -1 to skip initialization 264 /// \param precision the desired relative precision 265 /// \throw std::invalid_argument if \c m_max is greater than \c cheb_table_mmax (see boys_cheb7.h) 266 /// \throw std::invalid_argument if \c precision is smaller than std::numeric_limits<double>::epsilon() 267 FmEval_Chebyshev7(int m_max, double precision = std::numeric_limits<double>::epsilon()) : mmax(m_max)268 mmax(m_max), numbers_(14) { 269 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) 270 # if !defined(__AVX__) && defined(NDEBUG) 271 if (libint2::verbose()) { 272 static bool printed_performance_warning = false; 273 if (!printed_performance_warning) { 274 libint2::verbose_stream() 275 << "libint2::FmEval_Chebyshev7 on x86(-64) platforms needs AVX support for best performance" 276 << std::endl; 277 printed_performance_warning = true; 278 } 279 } 280 # endif 281 #endif 282 if (precision < std::numeric_limits<double>::epsilon()) 283 throw std::invalid_argument(std::string("FmEval_Chebyshev7 does not support precision smaller than ") + std::to_string(std::numeric_limits<double>::epsilon())); 284 if (mmax > cheb_table_mmax) 285 throw std::invalid_argument( 286 "FmEval_Chebyshev7::init() : requested mmax exceeds the " 287 "hard-coded mmax"); 288 if (m_max >= 0) 289 init_table(); 290 } ~FmEval_Chebyshev7()291 ~FmEval_Chebyshev7() { 292 if (mmax >= 0) { 293 free(c); 294 } 295 } 296 297 /// Singleton interface allows to manage the lone instance; adjusts max m values as needed in thread-safe fashion 298 static std::shared_ptr<const FmEval_Chebyshev7> instance(int m_max, double = 0.0) { 299 300 assert(m_max >= 0); 301 // thread-safe per C++11 standard [6.7.4] 302 static auto instance_ = std::make_shared<const FmEval_Chebyshev7>(m_max); 303 304 while (instance_->max_m() < m_max) { 305 static std::mutex mtx; 306 std::lock_guard<std::mutex> lck(mtx); 307 if (instance_->max_m() < m_max) { 308 auto new_instance = std::make_shared<const FmEval_Chebyshev7>(m_max); 309 instance_ = new_instance; 310 } 311 } 312 313 return instance_; 314 } 315 316 /// @return the maximum value of m for which the Boys function can be computed with this object max_m()317 int max_m() const { return mmax; } 318 319 /// fills in Fm with computed Boys function values for m in [0,mmax] 320 /// @param[out] Fm array to be filled in with the Boys function values, must be at least mmax+1 elements long 321 /// @param[in] x the Boys function argument 322 /// @param[in] mmax the maximum value of m for which Boys function will be computed; mmax must be <= the value returned by max_m eval(Real * Fm,Real x,int m_max)323 inline void eval(Real* Fm, Real x, int m_max) const { 324 325 // large T => use upward recursion 326 // cost = 1 div + 1 sqrt + (1 + 2*(m-1)) muls 327 if (x > T_crit) { 328 const double one_over_x = 1/x; 329 Fm[0] = 0.88622692545275801365 * sqrt(one_over_x); // see Eq. (9.8.9) in Helgaker-Jorgensen-Olsen 330 if (m_max == 0) 331 return; 332 // this upward recursion formula omits - e^(-x)/(2x), which for x>T_crit is small enough to guarantee full double precision 333 for (int i = 1; i <= m_max; i++) 334 Fm[i] = Fm[i - 1] * numbers_.ihalf[i] * one_over_x; // see Eq. (9.8.13) 335 return; 336 } 337 338 // --------------------------------------------- 339 // small and intermediate arguments => interpolate Fm and (optional) downward recursion 340 // --------------------------------------------- 341 // which interval does this x fall into? 342 const Real x_over_delta = x * one_over_delta; 343 const int iv = int(x_over_delta); // the interval index 344 const Real xd = x_over_delta - (Real)iv - 0.5; // this ranges from -0.5 to 0.5 345 const int m_min = 0; 346 347 #if defined(__AVX__) 348 const auto x2 = xd*xd; 349 const auto x3 = x2*xd; 350 const auto x4 = x2*x2; 351 const auto x5 = x2*x3; 352 const auto x6 = x3*x3; 353 const auto x7 = x3*x4; 354 libint2::simd::VectorAVXDouble x0vec(1., xd, x2, x3); 355 libint2::simd::VectorAVXDouble x1vec(x4, x5, x6, x7); 356 #endif // AVX 357 358 const Real *d = c + (ORDERp1) * (iv * (mmax+1) + m_min); // ptr to the interpolation data for m=mmin 359 int m = m_min; 360 #if defined(__AVX__) 361 if (m_max-m >=3) { 362 const int unroll_size = 4; 363 const int m_fence = (m_max + 2 - unroll_size); 364 for(; m<m_fence; m+=unroll_size, d+=ORDERp1*unroll_size) { 365 libint2::simd::VectorAVXDouble d00v, d01v, d10v, d11v, 366 d20v, d21v, d30v, d31v; 367 d00v.load_aligned(d); d01v.load_aligned((d+4)); 368 d10v.load_aligned(d+ORDERp1); d11v.load_aligned((d+4)+ORDERp1); 369 d20v.load_aligned(d+2*ORDERp1); d21v.load_aligned((d+4)+2*ORDERp1); 370 d30v.load_aligned(d+3*ORDERp1); d31v.load_aligned((d+4)+3*ORDERp1); 371 libint2::simd::VectorAVXDouble fm0 = d00v * x0vec + d01v * x1vec; 372 libint2::simd::VectorAVXDouble fm1 = d10v * x0vec + d11v * x1vec; 373 libint2::simd::VectorAVXDouble fm2 = d20v * x0vec + d21v * x1vec; 374 libint2::simd::VectorAVXDouble fm3 = d30v * x0vec + d31v * x1vec; 375 libint2::simd::VectorAVXDouble sum0123 = horizontal_add(fm0, fm1, fm2, fm3); 376 sum0123.convert(&Fm[m]); 377 } 378 } // unroll_size=4 379 if (m_max-m >=1) { 380 const int unroll_size = 2; 381 const int m_fence = (m_max + 2 - unroll_size); 382 for(; m<m_fence; m+=unroll_size, d+=ORDERp1*unroll_size) { 383 libint2::simd::VectorAVXDouble d00v, d01v, d10v, d11v; 384 d00v.load_aligned(d); 385 d01v.load_aligned((d+4)); 386 d10v.load_aligned(d+ORDERp1); 387 d11v.load_aligned((d+4)+ORDERp1); 388 libint2::simd::VectorAVXDouble fm0 = d00v * x0vec + d01v * x1vec; 389 libint2::simd::VectorAVXDouble fm1 = d10v * x0vec + d11v * x1vec; 390 libint2::simd::VectorSSEDouble sum01 = horizontal_add(fm0, fm1); 391 sum01.convert(&Fm[m]); 392 } 393 } // unroll_size=2 394 { // no unrolling 395 for(; m<=m_max; ++m, d+=ORDERp1) { 396 libint2::simd::VectorAVXDouble d0v, d1v; 397 d0v.load_aligned(d); 398 d1v.load_aligned(d+4); 399 Fm[m] = horizontal_add(d0v * x0vec + d1v * x1vec); 400 } 401 } 402 #else // AVX not available 403 for(m=m_min; m<=m_max; ++m, d+=ORDERp1) { 404 Fm[m] = d[0] 405 + xd * (d[1] 406 + xd * (d[2] 407 + xd * (d[3] 408 + xd * (d[4] 409 + xd * (d[5] 410 + xd * (d[6] 411 + xd * (d[7]))))))); 412 413 // // check against the reference value 414 // if (false) { 415 // double refvalue = FmEval_Reference2<double>::eval(x, m); // compute F(T) 416 // if (abs(refvalue - Fm[m]) > 1e-10) { 417 // std::cout << "T = " << x << " m = " << m << " cheb = " 418 // << Fm[m] << " ref = " << refvalue << std::endl; 419 // } 420 // } 421 } 422 #endif 423 424 425 } // eval() 426 427 private: 428 init_table()429 void init_table() { 430 431 // get memory 432 void* result; 433 int status = posix_memalign(&result, ORDERp1*sizeof(Real), (mmax + 1) * cheb_table_nintervals * ORDERp1 * sizeof(Real)); 434 if (status != 0) { 435 if (status == EINVAL) 436 throw std::logic_error( 437 "FmEval_Chebyshev7::init() : posix_memalign failed, alignment must be a power of 2 at least as large as sizeof(void *)"); 438 if (status == ENOMEM) 439 throw std::bad_alloc(); 440 abort(); // should be unreachable 441 } 442 c = static_cast<Real*>(result); 443 444 // copy contents of static table into c 445 // need all intervals 446 for (std::size_t iv = 0; iv < cheb_table_nintervals; ++iv) { 447 // but only values of m up to mmax 448 std::copy(cheb_table[iv], cheb_table[iv]+(mmax+1)*ORDERp1, c+(iv*(mmax+1))*ORDERp1); 449 } 450 } 451 452 }; // FmEval_Chebyshev7 453 454 #if LIBINT2_CONSTEXPR_STATICS 455 template <typename Real> 456 constexpr 457 double FmEval_Chebyshev7<Real>::cheb_table[FmEval_Chebyshev7<Real>::cheb_table_nintervals][(FmEval_Chebyshev7<Real>::cheb_table_mmax+1)*(FmEval_Chebyshev7<Real>::interpolation_order+1)]; 458 #else 459 // clang needs an explicit specifalization declaration to avoid warning 460 # ifdef __clang__ 461 template <typename Real> 462 double FmEval_Chebyshev7<Real>::cheb_table[FmEval_Chebyshev7<Real>::cheb_table_nintervals][(FmEval_Chebyshev7<Real>::cheb_table_mmax+1)*(FmEval_Chebyshev7<Real>::interpolation_order+1)]; 463 # endif 464 #endif 465 466 #ifndef STATIC_OON 467 #define STATIC_OON 468 namespace { 469 const double oon[] = {0.0, 1.0, 1.0/2.0, 1.0/3.0, 1.0/4.0, 1.0/5.0, 1.0/6.0, 1.0/7.0, 1.0/8.0, 1.0/9.0, 1.0/10.0, 1.0/11.0}; 470 } 471 #endif 472 473 /** Computes the Boys function, \$ F_m (T) = \int_0^1 u^{2m} \exp(-T u^2) \, {\rm d}u \$, 474 * using Taylor interpolation of up to 8-th order. 475 * @tparam Real the type to use for all floating-point computations. Following expressions must be valid: exp(Real), pow(Real), fabs(Real), max(Real), and floor(Real). 476 * @tparam INTERPOLATION_ORDER the interpolation order. The higher the order the less memory this object will need, but the computational cost will increase (usually very slightly) 477 */ 478 template<typename Real = double, int INTERPOLATION_ORDER = 7> 479 class FmEval_Taylor { 480 public: 481 static_assert(std::is_same<Real,double>::value, "FmEval_Taylor only supports double as the real type"); 482 483 static const int max_interp_order = 8; 484 static const bool INTERPOLATION_AND_RECURSION = false; // compute F_lmax(T) and then iterate down to F_0(T)? Else use interpolation only 485 const double soft_zero_; 486 487 /// Constructs the object to be able to compute Boys funcion for m in [0,mmax], with relative \c precision 488 FmEval_Taylor(unsigned int mmax, Real precision = std::numeric_limits<Real>::epsilon()) : 489 soft_zero_(1e-6), cutoff_(precision), numbers_( 490 INTERPOLATION_ORDER + 1, 2 * (mmax + INTERPOLATION_ORDER - 1)) { 491 492 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) 493 # if !defined(__AVX__) && defined(NDEBUG) 494 if (libint2::verbose()) { 495 static bool printed_performance_warning = false; 496 if (!printed_performance_warning) { 497 libint2::verbose_stream() 498 << "libint2::FmEval_Taylor on x86(-64) platforms needs AVX support for best performance" 499 << std::endl; 500 printed_performance_warning = true; 501 } 502 } 503 # endif 504 #endif 505 506 assert(mmax <= 63); 507 508 const double sqrt_pi = std::sqrt(M_PI); 509 510 /*--------------------------------------- 511 We are doing Taylor interpolation with 512 n=TAYLOR_ORDER terms here: 513 error <= delT^n/(n+1)! 514 ---------------------------------------*/ 515 delT_ = 2.0 516 * std::pow(cutoff_ * numbers_.fac[INTERPOLATION_ORDER + 1], 517 1.0 / INTERPOLATION_ORDER); 518 oodelT_ = 1.0 / delT_; 519 max_m_ = mmax + INTERPOLATION_ORDER - 1; 520 521 T_crit_ = new Real[max_m_ + 1]; /*--- m=0 is included! ---*/ 522 max_T_ = 0; 523 /*--- Figure out T_crit for each m and put into the T_crit ---*/ 524 for (int m = max_m_; m >= 0; --m) { 525 /*------------------------------------------ 526 Damped Newton-Raphson method to solve 527 T^{m-0.5}*exp(-T) = epsilon*Gamma(m+0.5) 528 The solution is the max T for which to do 529 the interpolation 530 ------------------------------------------*/ 531 double T = -log(cutoff_); 532 const double egamma = cutoff_ * sqrt_pi * numbers_.df[2 * m] 533 / std::pow(2.0, m); 534 double T_new = T; 535 double func; 536 do { 537 const double damping_factor = 0.2; 538 T = T_new; 539 /* f(T) = the difference between LHS and RHS of the equation above */ 540 func = std::pow(T, m - 0.5) * std::exp(-T) - egamma; 541 const double dfuncdT = ((m - 0.5) * std::pow(T, m - 1.5) 542 - std::pow(T, m - 0.5)) * std::exp(-T); 543 /* f(T) has 2 roots and has a maximum in between. If f'(T) > 0 we are to the left of the hump. Make a big step to the right. */ 544 if (dfuncdT > 0.0) { 545 T_new *= 2.0; 546 } else { 547 /* damp the step */ 548 double deltaT = -func / dfuncdT; 549 const double sign_deltaT = (deltaT > 0.0) ? 1.0 : -1.0; 550 const double max_deltaT = damping_factor * T; 551 if (std::fabs(deltaT) > max_deltaT) 552 deltaT = sign_deltaT * max_deltaT; 553 T_new = T + deltaT; 554 } 555 if (T_new <= 0.0) { 556 T_new = T / 2.0; 557 } 558 } while (std::fabs(func / egamma) >= soft_zero_); 559 T_crit_[m] = T_new; 560 const int T_idx = (int) std::floor(T_new / delT_); 561 max_T_ = std::max(max_T_, T_idx); 562 } 563 564 // allocate the grid (see the comments below) 565 { 566 const int nrow = max_T_ + 1; 567 const int ncol = max_m_ + 1; 568 grid_ = new Real*[nrow]; 569 grid_[0] = new Real[nrow * ncol]; 570 //std::cout << "Allocated interpolation table of " << nrow * ncol << " reals" << std::endl; 571 for (int r = 1; r < nrow; ++r) 572 grid_[r] = grid_[r - 1] + ncol; 573 } 574 575 /*------------------------------------------------------- 576 Tabulate the gamma function from t=delT to T_crit[m]: 577 1) include T=0 though the table is empty for T=0 since 578 Fm(0) is simple to compute 579 -------------------------------------------------------*/ 580 /*--- do the mmax first ---*/ 581 for (int T_idx = max_T_; T_idx >= 0; --T_idx) { 582 const double T = T_idx * delT_; 583 libint2::FmEval_Reference2<double>::eval(grid_[T_idx], T, max_m_); 584 } 585 } 586 ~FmEval_Taylor()587 ~FmEval_Taylor() { 588 delete[] T_crit_; 589 delete[] grid_[0]; 590 delete[] grid_; 591 } 592 593 /// Singleton interface allows to manage the lone instance; 594 /// adjusts max m and precision values as needed in thread-safe fashion 595 static std::shared_ptr<const FmEval_Taylor> instance(unsigned int mmax, Real precision = std::numeric_limits<Real>::epsilon()) { 596 assert(mmax >= 0); 597 assert(precision >= 0); 598 // thread-safe per C++11 standard [6.7.4] 599 static auto instance_ = std::make_shared<const FmEval_Taylor>(mmax, precision); 600 601 while (instance_->max_m() < mmax || instance_->precision() > precision) { 602 static std::mutex mtx; 603 std::lock_guard<std::mutex> lck(mtx); 604 if (instance_->max_m() < mmax || instance_->precision() > precision) { 605 auto new_instance = std::make_shared<const FmEval_Taylor>(mmax, precision); 606 instance_ = new_instance; 607 } 608 } 609 610 return instance_; 611 } 612 613 /// @return the maximum value of m for which this object can compute the Boys function max_m()614 int max_m() const { return max_m_ - INTERPOLATION_ORDER + 1; } 615 /// @return the precision with which this object can compute the Boys function precision()616 Real precision() const { return cutoff_; } 617 618 /// computes Boys function values with m index in range [0,mmax] 619 /// @param[out] Fm array to be filled in with the Boys function values, must be at least mmax+1 elements long 620 /// @param[in] x the Boys function argument 621 /// @param[in] mmax the maximum value of m for which Boys function will be computed; 622 /// it must be <= the value returned by max_m() (this is not checked) eval(Real * Fm,Real T,int mmax)623 void eval(Real* Fm, Real T, int mmax) const { 624 const double sqrt_pio2 = 1.2533141373155002512; 625 const double two_T = 2.0 * T; 626 627 // stop recursion at mmin 628 const int mmin = INTERPOLATION_AND_RECURSION ? mmax : 0; 629 /*------------------------------------- 630 Compute Fm(T) from mmax down to mmin 631 -------------------------------------*/ 632 const bool use_upward_recursion = true; 633 if (use_upward_recursion) { 634 // if (T > 30.0) { 635 if (T > T_crit_[0]) { 636 const double one_over_x = 1.0/T; 637 Fm[0] = 0.88622692545275801365 * sqrt(one_over_x); // see Eq. (9.8.9) in Helgaker-Jorgensen-Olsen 638 if (mmax == 0) 639 return; 640 // this upward recursion formula omits - e^(-x)/(2x), which for x>T_crit is <1e-15 641 for (int i = 1; i <= mmax; i++) 642 Fm[i] = Fm[i - 1] * numbers_.ihalf[i] * one_over_x; // see Eq. (9.8.13) 643 return; 644 } 645 } 646 647 // since Tcrit grows with mmax, this condition only needs to be determined once 648 if (T > T_crit_[mmax]) { 649 double pow_two_T_to_minusjp05 = std::pow(two_T, -mmax - 0.5); 650 for (int m = mmax; m >= mmin; --m) { 651 /*--- Asymptotic formula ---*/ 652 Fm[m] = numbers_.df[2 * m] * sqrt_pio2 * pow_two_T_to_minusjp05; 653 pow_two_T_to_minusjp05 *= two_T; 654 } 655 } 656 else 657 { 658 const int T_ind = (int) (0.5 + T * oodelT_); 659 const Real h = T_ind * delT_ - T; 660 const Real* F_row = grid_[T_ind] + mmin; 661 662 #if defined (__AVX__) 663 libint2::simd::VectorAVXDouble h0123, h4567; 664 if (INTERPOLATION_ORDER == 3 || INTERPOLATION_ORDER == 7) { 665 const double h2 = h*h*oon[2]; 666 const double h3 = h2*h*oon[3]; 667 h0123 = libint2::simd::VectorAVXDouble (1.0, h, h2, h3); 668 if (INTERPOLATION_ORDER == 7) { 669 const double h4 = h3*h*oon[4]; 670 const double h5 = h4*h*oon[5]; 671 const double h6 = h5*h*oon[6]; 672 const double h7 = h6*h*oon[7]; 673 h4567 = libint2::simd::VectorAVXDouble (h4, h5, h6, h7); 674 } 675 } 676 // libint2::simd::VectorAVXDouble h0123(1.0); 677 // libint2::simd::VectorAVXDouble h4567(1.0); 678 #endif 679 680 int m = mmin; 681 if (mmax-m >=1) { 682 const int unroll_size = 2; 683 const int m_fence = (mmax + 2 - unroll_size); 684 for(; m<m_fence; m+=unroll_size, F_row+=unroll_size) { 685 686 #if defined(__AVX__) 687 if (INTERPOLATION_ORDER == 3 || INTERPOLATION_ORDER == 7) { 688 libint2::simd::VectorAVXDouble fr0_0123; fr0_0123.load(F_row); 689 libint2::simd::VectorAVXDouble fr1_0123; fr1_0123.load(F_row+1); 690 libint2::simd::VectorSSEDouble fm01 = horizontal_add(fr0_0123*h0123, fr1_0123*h0123); 691 if (INTERPOLATION_ORDER == 7) { 692 libint2::simd::VectorAVXDouble fr0_4567; fr0_4567.load(F_row+4); 693 libint2::simd::VectorAVXDouble fr1_4567; fr1_4567.load(F_row+5); 694 fm01 += horizontal_add(fr0_4567*h4567, fr1_4567*h4567); 695 } 696 fm01.convert(&Fm[m]); 697 } 698 else { 699 #endif 700 Real total0 = 0.0, total1 = 0.0; 701 for(int i=INTERPOLATION_ORDER; i>=1; --i) { 702 total0 = oon[i]*h*(F_row[i] + total0); 703 total1 = oon[i]*h*(F_row[i+1] + total1); 704 } 705 Fm[m] = F_row[0] + total0; 706 Fm[m+1] = F_row[1] + total1; 707 #if defined(__AVX__) 708 } 709 #endif 710 } 711 } // unroll_size = 2 712 if (m<=mmax) { // unroll_size = 1 713 #if defined(__AVX__) 714 if (INTERPOLATION_ORDER == 3 || INTERPOLATION_ORDER == 7) { 715 libint2::simd::VectorAVXDouble fr0123; fr0123.load(F_row); 716 if (INTERPOLATION_ORDER == 7) { 717 libint2::simd::VectorAVXDouble fr4567; fr4567.load(F_row+4); 718 // libint2::simd::VectorSSEDouble fm = horizontal_add(fr0123*h0123, fr4567*h4567); 719 // Fm[m] = horizontal_add(fm); 720 Fm[m] = horizontal_add(fr0123*h0123 + fr4567*h4567); 721 } 722 else { // INTERPOLATION_ORDER == 3 723 Fm[m] = horizontal_add(fr0123*h0123); 724 } 725 } 726 else { 727 #endif 728 Real total = 0.0; 729 for(int i=INTERPOLATION_ORDER; i>=1; --i) { 730 total = oon[i]*h*(F_row[i] + total); 731 } 732 Fm[m] = F_row[0] + total; 733 #if defined(__AVX__) 734 } 735 #endif 736 } // unroll_size = 1 737 738 // check against the reference value 739 // if (false) { 740 // double refvalue = FmEval_Reference2<double>::eval(T, mmax); // compute F(T) with m=mmax 741 // if (abs(refvalue - Fm[mmax]) > 1e-14) { 742 // std::cout << "T = " << T << " m = " << mmax << " cheb = " 743 // << Fm[mmax] << " ref = " << refvalue << std::endl; 744 // } 745 // } 746 747 } // if T < T_crit 748 749 /*------------------------------------ 750 And then do downward recursion in j 751 ------------------------------------*/ 752 if (INTERPOLATION_AND_RECURSION && mmin > 0) { 753 const Real exp_mT = std::exp(-T); 754 for (int m = mmin - 1; m >= 0; --m) { 755 Fm[m] = (exp_mT + two_T * Fm[m+1]) * numbers_.twoi1[m]; 756 } 757 } 758 } 759 760 private: 761 Real **grid_; /* Table of "exact" Fm(T) values. Row index corresponds to 762 values of T (max_T+1 rows), column index to values 763 of m (max_m+1 columns) */ 764 Real delT_; /* The step size for T, depends on cutoff */ 765 Real oodelT_; /* 1.0 / delT_, see above */ 766 Real cutoff_; /* Tolerance cutoff used in all computations of Fm(T) */ 767 int max_m_; /* Maximum value of m in the table, depends on cutoff 768 and the number of terms in Taylor interpolation */ 769 int max_T_; /* Maximum index of T in the table, depends on cutoff 770 and m */ 771 Real *T_crit_; /* Maximum T for each row, depends on cutoff; 772 for a given m and T_idx <= max_T_idx[m] use Taylor interpolation, 773 for a given m and T_idx > max_T_idx[m] use the asymptotic formula */ 774 775 ExpensiveNumbers<double> numbers_; 776 777 /** 778 * Power series estimate of the error introduced by replacing 779 * \f$ F_m(T) = \int_0^1 \exp(-T t^2) t^{2 m} \, \mathrm{d} t \f$ with analytically 780 * integrable \f$ \int_0^\infty \exp(-T t^2) t^{2 m} \, \mathrm{d} t = \frac{(2m-1)!!}{2^{m+1}} \sqrt{\frac{\pi}{T^{2m+1}}} \f$ 781 * @param m 782 * @param T 783 * @return the error estimate 784 */ truncation_error(unsigned int m,double T)785 static double truncation_error(unsigned int m, double T) { 786 const double m2= m * m; 787 const double m3= m2 * m; 788 const double m4= m2 * m2; 789 const double T2= T * T; 790 const double T3= T2 * T; 791 const double T4= T2 * T2; 792 const double T5= T2 * T3; 793 794 const double result = exp(-T) * (105 + 16*m4 + 16*m3*(T - 8) - 30*T + 12*T2 795 - 8*T3 + 16*T4 + 8*m2*(43 - 9*T + 2*T2) + 796 4*m*(-88 + 23*T - 8*T2 + 4*T3))/(32*T5); 797 return result; 798 } 799 /** 800 * Leading-order estimate of the error introduced by replacing 801 * \f$ F_m(T) = \int_0^1 \exp(-T t^2) t^{2 m} \, \mathrm{d} t \f$ with analytically 802 * integrable \f$ \int_0^\infty \exp(-T t^2) t^{2 m} \, \mathrm{d} t = \frac{(2m-1)!!}{2^{m+1}} \sqrt{\frac{\pi}{T^{2m+1}}} \f$ 803 * @param m 804 * @param T 805 * @return the error estimate 806 */ truncation_error(double T)807 static double truncation_error(double T) { 808 const double result = exp(-T) /(2*T); 809 return result; 810 } 811 }; 812 813 814 namespace detail { pow10(long exp)815 constexpr double pow10(long exp) { 816 return (exp == 0) ? 1. : ((exp > 0) ? 10. * pow10(exp-1) : 0.1 * pow10(exp+1)); 817 } 818 } 819 820 ////////////////////////////////////////////////////////// 821 /// core integral for Yukawa and exponential interactions 822 ////////////////////////////////////////////////////////// 823 824 /** 825 * Evaluates core integral for Gaussian integrals over the Yukawa potential \f$ \exp(- \zeta r) / r \f$ and 826 * the exponential interaction \f$ \exp(- \zeta r) \f$ 827 * @tparam Real real type 828 * @warning only @p Real = double is supported 829 * @warning guarantees absolute precision of only about 1e-14 830 */ 831 template<typename Real = double> 832 struct TennoGmEval { 833 834 private: 835 836 #include <libint2/tenno_cheb.h> 837 838 static_assert(std::is_same<Real,double>::value, "TennoGmEval only supports double as the real type"); 839 840 static const int mmin_ = -1; 841 static constexpr Real Tmax = (1 << cheb_table_tmaxlog2); //!< critical value of T above which use upward recursion 842 static constexpr Real Umax = detail::pow10(cheb_table_umaxlog10); //!< max value of U for which to interpolate 843 static constexpr Real Umin = detail::pow10(cheb_table_uminlog10); //!< min value of U for which to interpolate 844 static constexpr std::size_t ORDERp1 = interpolation_order + 1; 845 static constexpr Real maxabsprecision = 1.4e-14; //!< guaranteed abs precision of the interpolation table for m>0 846 847 public: 848 /// \param m_max maximum value of the Gm function index 849 /// \param precision the desired *absolute* precision (relative precision for most intervals will be below epsilon, but for large T/U values and high m relative precision is low 850 /// \throw std::invalid_argument if \c m_max is greater than \c cheb_table_mmax (see tenno_cheb.h) 851 /// \throw std::invalid_argument if \c precision is smaller than \c maxabsprecision 852 TennoGmEval(unsigned int mmax, Real precision = -1) : mmax_TennoGmEval853 mmax_(mmax), precision_(precision), 854 numbers_() { 855 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) 856 # if !defined(__AVX__) && defined(NDEBUG) 857 if (libint2::verbose()) { 858 static bool printed_performance_warning = false; 859 if (!printed_performance_warning) { 860 libint2::verbose_stream() 861 << "libint2::TennoGmEval on x86(-64) platforms needs AVX support for best performance" 862 << std::endl; 863 printed_performance_warning = true; 864 } 865 } 866 # endif 867 #endif 868 869 // if (precision_ < maxabsprecision) 870 // throw std::invalid_argument( 871 // std::string( 872 // "TennoGmEval does not support precision smaller than ") + 873 // std::to_string(maxabsprecision)); 874 875 if (mmax > cheb_table_mmax) 876 throw std::invalid_argument( 877 "TennoGmEval::init() : requested mmax exceeds the " 878 "hard-coded mmax"); 879 init_table(); 880 } 881 ~TennoGmEvalTennoGmEval882 ~TennoGmEval() { 883 if (c_ != nullptr) 884 free(c_); 885 } 886 887 /// Singleton interface allows to manage the lone instance; adjusts max m values as needed in thread-safe fashion 888 static std::shared_ptr<const TennoGmEval> instance(int m_max, double = 0) { 889 890 assert(m_max >= 0); 891 // thread-safe per C++11 standard [6.7.4] 892 static auto instance_ = std::make_shared<const TennoGmEval>(m_max); 893 894 while (instance_->max_m() < m_max) { 895 static std::mutex mtx; 896 std::lock_guard<std::mutex> lck(mtx); 897 if (instance_->max_m() < m_max) { 898 auto new_instance = std::make_shared<const TennoGmEval>(m_max); 899 instance_ = new_instance; 900 } 901 } 902 903 return instance_; 904 } 905 max_mTennoGmEval906 unsigned int max_m() const { return mmax_; } 907 /// @return the precision with which this object can compute the result precisionTennoGmEval908 Real precision() const { return precision_; } 909 910 /// @param[in] Gm pointer to array of @c mmax+1 @c Real elements, on 911 /// return this contains the core integral for Yukawa 912 /// interaction, \f$ \exp(-zeta r_{12})/r_{12} \f$ , 913 /// namely \f$ G_{m}(T,U) = \int_0^1 t^{2m} \exp(U(1-t^{-2}) - Tt^2) dt, m \in [0,m_\mathrm{max}] \f$, where 914 /// \f$ T = \rho |\vec{P} - \vec{Q}|^2 \f$ and 915 /// \f$ U = \zeta^2 / (4 \rho) \f$ 916 /// @param[in] one_over_rho \f$ 1/\rho \f$ 917 /// @param[in] T \f$ T \f$ 918 /// @param[in] mmax \f$ m_\mathrm{max} \f$ 919 /// @param[in] zeta \f$ \zeta \f$ eval_yukawaTennoGmEval920 void eval_yukawa(Real* Gm, Real one_over_rho, Real T, size_t mmax, Real zeta) const { 921 assert(mmax <= mmax_); 922 assert(T >= 0); 923 const auto U = 0.25 * zeta * zeta * one_over_rho; 924 assert(U >= 0); 925 if (T > Tmax || U < Umin) { 926 eval_Gm_urr(Gm, T, U, mmax, 0); // no need for G_-1 927 } else { 928 interpolate_Gm<false>(Gm, T, U, 0, mmax); 929 } 930 } 931 /// @param[in] Gm pointer to array of @c mmax+1 @c Real elements, on 932 /// return this contains the core integral for Slater-type 933 /// geminal, \f$ \exp(-zeta r_{12}) \f$ , 934 /// namely \f$ \sqrt{U} (G_{m-1}(T,U) - G_m(T,U)), m \in [0,m_\mathrm{max}] \f$ where\ 935 // \f$ G_{m}(T,U) = \int_0^1 t^{2m} \exp(U(1-t^{-2}) - Tt^2) dt \f$, where 936 /// \f$ T = \rho |\vec{P} - \vec{Q}|^2 \f$ and 937 /// \f$ U = \zeta^2 / (4 \rho) \f$ 938 /// @param[in] one_over_rho \f$ 1/\rho \f$ 939 /// @param[in] T \f$ T \f$ 940 /// @param[in] mmax \f$ m_\mathrm{max} \f$ 941 /// @param[in] zeta \f$ \zeta \f$ eval_slaterTennoGmEval942 void eval_slater(Real* Gm, Real one_over_rho, Real T, size_t mmax, Real zeta) const { 943 assert(mmax <= mmax_); 944 assert(T >= 0); 945 const auto U = 0.25 * zeta * zeta * one_over_rho; 946 assert(U > 0); // integral factorizes into 2 overlaps for U = 0 947 const auto zeta_over_two_rho = 0.5 * zeta * one_over_rho; 948 if (T > Tmax) { 949 eval_Gm_urr(Gm, T, U, mmax, -1); 950 } else { 951 interpolate_Gm<true>(Gm, T, U, zeta_over_two_rho, mmax); 952 } 953 } 954 955 private: 956 957 /// Computes G_0 and (optionally) G_{-1} 958 /// @tparam compute_Gm1, if true, compute G_0 and G_{-1}, otherwise G_0 only 959 /// @param[in] T the value of \f$ T \f$ 960 /// @param[in] U the value of \f$ U \f$ 961 /// @return if @c compute_Gm1==true return {G_0,G_{-1}}, otherwise {G_0,0} 962 template <bool compute_Gm1> eval_G0_and_maybe_Gm1TennoGmEval963 static inline std::tuple<Real,Real> eval_G0_and_maybe_Gm1(Real T, Real U) { 964 assert(T >= 0); 965 assert(U >= 0); 966 const Real sqrtPi(1.7724538509055160272981674833411451827975494561224); 967 968 Real G_m1 = 0; 969 Real G_0 = 0; 970 if (U == 0) { // \sqrt{U} G_{-1} is finite, need to handle that case separately 971 assert(!compute_Gm1); 972 if (T < std::numeric_limits<Real>::epsilon()) { 973 G_0 = 1; 974 } 975 else { 976 const Real sqrt_T = sqrt(T); 977 const Real sqrtPi_over_2 = sqrtPi * 0.5; 978 G_0 = sqrtPi_over_2 * erf(sqrt_T) / sqrt_T; 979 } 980 } 981 else if (T > std::numeric_limits<Real>::epsilon()) { // U > 0 982 const Real sqrt_U = sqrt(U); 983 const Real sqrt_T = sqrt(T); 984 const Real oo_sqrt_T = 1 / sqrt_T; 985 const Real oo_sqrt_U = compute_Gm1 ? (1 / sqrt_U) : 0; 986 const Real kappa = sqrt_U - sqrt_T; 987 const Real lambda = sqrt_U + sqrt_T; 988 const Real sqrtPi_over_4 = sqrtPi * 0.25; 989 const Real pfac = sqrtPi_over_4; 990 const Real erfc_k = exp(kappa * kappa - T) * erfc(kappa); 991 const Real erfc_l = exp(lambda * lambda - T) * erfc(lambda); 992 993 G_m1 = compute_Gm1 ? pfac * (erfc_k + erfc_l) * oo_sqrt_U : 0.0; 994 G_0 = pfac * (erfc_k - erfc_l) * oo_sqrt_T; 995 } 996 else { // T = 0, U > 0 997 const Real exp_U = exp(U); 998 const Real sqrt_U = sqrt(U); 999 if (compute_Gm1) { 1000 const Real sqrtPi_over_2 = sqrtPi * 0.5; 1001 const Real oo_sqrt_U = compute_Gm1 ? (1 / sqrt_U) : 0; 1002 1003 G_m1 = exp_U * sqrtPi_over_2 * erfc(sqrt_U) * oo_sqrt_U; 1004 } 1005 G_0 = 1 - exp_U * sqrtPi * erfc(sqrt_U) * sqrt_U; 1006 } 1007 1008 return std::make_tuple(G_0, G_m1); 1009 } 1010 1011 /// Computes core integrals by upward recursion from \f$ G_{-1} (T,U) \f$ and \f$ G_0 (T,U) \f$ 1012 /// this is unstable for small T, so use when interpolation does not apply. 1013 /// @param[out] Gm \f$ G_m(T,U), m=mmin..mmax \f$ , i.e. @c Gm[m] \f$ = G_{\mathrm{mmin} + m}(T,U) \f$ 1014 /// @param[in] T the value of \f$ T \f$ 1015 /// @param[in] U the value of \f$ U \f$ 1016 /// @param[in] mmax the maximum value of \f$ m \f$ 1017 /// @param[in] mmin the minimum value of \f$ m \f$ ; the only valid values are 0 and -1 eval_Gm_urrTennoGmEval1018 static void eval_Gm_urr(Real* Gm, Real T, Real U, size_t mmax, long mmin) { 1019 assert(mmin == 0 || mmin == -1); 1020 assert(T > 0); 1021 assert(U > 0); 1022 1023 const Real sqrt_U = sqrt(U); 1024 const Real sqrt_T = sqrt(T); 1025 const Real oo_sqrt_T = 1 / sqrt_T; 1026 const Real oo_sqrt_U = 1 / sqrt_U; 1027 const Real kappa = sqrt_U - sqrt_T; 1028 const Real lambda = sqrt_U + sqrt_T; 1029 const Real sqrtPi_over_4(0.44311346272637900682454187083528629569938736403060); 1030 const Real pfac = sqrtPi_over_4; 1031 const Real erfc_k = exp(kappa*kappa - T) * erfc(kappa); 1032 const Real erfc_l = exp(lambda*lambda - T) * erfc(lambda); 1033 1034 Real G_m1_value; 1035 Real& G_m1 = (mmin == -1) ? Gm[0] : G_m1_value; 1036 G_m1 = pfac * (erfc_k + erfc_l) * oo_sqrt_U; 1037 Real& G_0 = (mmin == -1) ? Gm[1] : Gm[0]; 1038 G_0 = pfac * (erfc_k - erfc_l) * oo_sqrt_T; 1039 1040 if (mmax > 0) { 1041 1042 // first application of URR 1043 const Real oo_two_T = 0.5 / T; 1044 const Real two_U = 2.0 * U; 1045 const Real exp_mT = exp(-T); 1046 1047 Real* Gmm1 = &G_m1; 1048 Real* Gm0 = &G_0; 1049 Real* Gmp1 = Gm0 + 1; 1050 for(unsigned int m=1, two_m_minus_1=1; m<=mmax; ++m, two_m_minus_1+=2, ++Gmp1) { 1051 *Gmp1 = oo_two_T * ( two_m_minus_1 * *Gm0 + two_U * *Gmm1 - exp_mT); 1052 Gmm1 = Gm0; 1053 Gm0 = Gmp1; 1054 } 1055 } 1056 1057 return; 1058 } 1059 1060 /// compute core integrals by Chebyshev interpolation 1061 /// @tparam[in] exp if true, will compute core integrals of the exponential operator, otherwise will compute Yukawa 1062 /// @param[out] Gm_vec if @c exp==false Gm_vec[m]=\f$ G_m(T,U), m=0..mmax \f$, otherwise Gm_vec[m]=\f$ \frac{\zeta}{2 \rho} \left( G_{m-1}(T,U) - G_m(T,U) \right), m=0..mmax \f$ ; must be at least @c mmax+1 elements long 1063 /// @param[in] T the value of \f$ T \f$ 1064 /// @param[in] U the value of \f$ U \f$ 1065 /// @param[in] zeta_over_two_rho the value of \f$ \frac{\zeta}{2 \rho} \f$, only used if @c exp==true 1066 /// @param[in] mmax the maximum value of \f$ m \f$ 1067 template <bool exp> interpolate_GmTennoGmEval1068 void interpolate_Gm(Real* Gm_vec, Real T, Real U, Real zeta_over_two_rho, long mmax) const { 1069 assert(T >= 0); 1070 assert(U >= Umin && U <= Umax); 1071 1072 // maps x in [0,2] to [-1/2,1/2] in a linear fashion 1073 auto linear_map_02 = [](Real x) { 1074 assert(x >= 0 && x <= 2); 1075 return (x - 1) * 0.5; 1076 }; 1077 // maps x in [a, 2*a] to [-1/2,1/2] in a logarithmic fashion 1078 auto log2_map = [](Real x, Real one_over_a) { 1079 return std::log2(x * one_over_a) - 0.5; 1080 }; 1081 // maps x in [a, 10*a] to [-1/2,1/2] in a logarithmic fashion 1082 auto log10_map = [](Real x, Real one_over_a) { 1083 return std::log10(x * one_over_a) - 0.5; 1084 }; 1085 1086 // which interval does this T fall into? 1087 const int Tint = T < 2 ? 0 : int(std::floor(std::log2(T))); assert(Tint >= 0 && Tint < 10); 1088 // precomputed 1 / 2^K 1089 constexpr Real one_over_2K[] = {1, 0.5, 0.25, 0.125, 0.0625, 0.03125, 0.015625, 0.0078125, 0.00390625, .001953125}; 1090 // which interval does this U fall into? 1091 const int Uint = int(std::floor(std::log10(U / Umin))); assert(Uint >= 0 && Uint < 10); 1092 // precomputed 1 / 10^(cheb_table_uminlog10 + K) 1093 constexpr Real one_over_10K[] = {1. / detail::pow10(cheb_table_uminlog10), 1094 1. / detail::pow10(cheb_table_uminlog10+1), 1095 1. / detail::pow10(cheb_table_uminlog10+2), 1096 1. / detail::pow10(cheb_table_uminlog10+3), 1097 1. / detail::pow10(cheb_table_uminlog10+4), 1098 1. / detail::pow10(cheb_table_uminlog10+5), 1099 1. / detail::pow10(cheb_table_uminlog10+6), 1100 1. / detail::pow10(cheb_table_uminlog10+7), 1101 1. / detail::pow10(cheb_table_uminlog10+8), 1102 1. / detail::pow10(cheb_table_uminlog10+9)}; 1103 const Real t = Tint == 0 ? linear_map_02(T) : log2_map(T, one_over_2K[Tint]); // this ranges from -0.5 to 0.5 1104 const Real u = log10_map(U, one_over_10K[Uint]); // this ranges from -0.5 to 0.5 1105 1106 const int interval = Tint * 10 + Uint; 1107 1108 #if defined(__AVX__) 1109 assert(ORDERp1 == 16); 1110 const auto t2 = t*t; 1111 const auto t3 = t2*t; 1112 const auto t4 = t2*t2; 1113 const auto t5 = t2*t3; 1114 const auto t6 = t3*t3; 1115 const auto t7 = t3*t4; 1116 const auto t8 = t4*t4; 1117 const auto t9 = t4*t5; 1118 const auto t10 = t5*t5; 1119 const auto t11 = t5*t6; 1120 const auto t12 = t6*t6; 1121 const auto t13 = t6*t7; 1122 const auto t14 = t7*t7; 1123 const auto t15 = t7*t8; 1124 const auto u2 = u*u; 1125 const auto u3 = u2*u; 1126 const auto u4 = u2*u2; 1127 const auto u5 = u2*u3; 1128 const auto u6 = u3*u3; 1129 const auto u7 = u3*u4; 1130 const auto u8 = u4*u4; 1131 const auto u9 = u4*u5; 1132 const auto u10 = u5*u5; 1133 const auto u11 = u5*u6; 1134 const auto u12 = u6*u6; 1135 const auto u13 = u6*u7; 1136 const auto u14 = u7*u7; 1137 const auto u15 = u7*u8; 1138 libint2::simd::VectorAVXDouble u0vec(1., u, u2, u3); 1139 libint2::simd::VectorAVXDouble u1vec(u4, u5, u6, u7); 1140 libint2::simd::VectorAVXDouble u2vec(u8, u9, u10, u11); 1141 libint2::simd::VectorAVXDouble u3vec(u12, u13, u14, u15); 1142 #endif // AVX 1143 1144 Real Gmm1 = 0.0; // will track the previous value, only used for exp==false 1145 1146 constexpr bool compute_Gmm10 = true; 1147 long mmin_interp; 1148 if (compute_Gmm10) { 1149 // precision of interpolation for m=-1,0 can be insufficient, just evaluate explicitly 1150 Real G0; 1151 std::tie(exp ? G0 : Gm_vec[0], Gmm1) = eval_G0_and_maybe_Gm1<exp>(T, U); 1152 if (exp) { 1153 Gm_vec[0] = (Gmm1 - G0) * zeta_over_two_rho; 1154 Gmm1 = G0; 1155 } 1156 mmin_interp = 1; 1157 } 1158 else 1159 mmin_interp = exp ? -1 : 0; 1160 1161 // now compute the rest 1162 for(long m=mmin_interp; m<=mmax; ++m){ 1163 const Real *c_tuint = c_ + (ORDERp1) * (ORDERp1) * (interval * (mmax_ - mmin_ + 1) + (m - mmin_)); // ptr to the interpolation data for m=mmin 1164 #if defined(__AVX__) 1165 libint2::simd::VectorAVXDouble c00v, c01v, c02v, c03v, c10v, c11v, c12v, c13v, 1166 c20v, c21v, c22v, c23v, c30v, c31v, c32v, c33v, 1167 c40v, c41v, c42v, c43v, c50v, c51v, c52v, c53v, 1168 c60v, c61v, c62v, c63v, c70v, c71v, c72v, c73v; 1169 libint2::simd::VectorAVXDouble c80v, c81v, c82v, c83v, c90v, c91v, c92v, c93v, 1170 ca0v, ca1v, ca2v, ca3v, cb0v, cb1v, cb2v, cb3v, 1171 cc0v, cc1v, cc2v, cc3v, cd0v, cd1v, cd2v, cd3v, 1172 ce0v, ce1v, ce2v, ce3v, cf0v, cf1v, cf2v, cf3v; 1173 c00v.load_aligned(c_tuint); c01v.load_aligned((c_tuint+4)); 1174 c02v.load_aligned(c_tuint+8); c03v.load_aligned((c_tuint+12)); 1175 libint2::simd::VectorAVXDouble t0vec(1, 1, 1, 1); 1176 libint2::simd::VectorAVXDouble t0 = t0vec * (c00v * u0vec + c01v * u1vec + c02v * u2vec + c03v * u3vec); 1177 c10v.load_aligned(c_tuint +ORDERp1); c11v.load_aligned((c_tuint+4) +ORDERp1); 1178 c12v.load_aligned((c_tuint+8) +ORDERp1); c13v.load_aligned((c_tuint+12) +ORDERp1); 1179 libint2::simd::VectorAVXDouble t1vec(t, t, t, t); 1180 libint2::simd::VectorAVXDouble t1 = t1vec * (c10v * u0vec + c11v * u1vec + c12v * u2vec + c13v * u3vec); 1181 c20v.load_aligned(c_tuint +2*ORDERp1); c21v.load_aligned((c_tuint+4) +2*ORDERp1); 1182 c22v.load_aligned((c_tuint+8)+2*ORDERp1); c23v.load_aligned((c_tuint+12)+2*ORDERp1); 1183 libint2::simd::VectorAVXDouble t2vec(t2, t2, t2, t2); 1184 libint2::simd::VectorAVXDouble t2 = t2vec * (c20v * u0vec + c21v * u1vec + c22v * u2vec + c23v * u3vec); 1185 c30v.load_aligned(c_tuint +3*ORDERp1); c31v.load_aligned((c_tuint+4) +3*ORDERp1); 1186 c32v.load_aligned((c_tuint+8)+3*ORDERp1); c33v.load_aligned((c_tuint+12)+3*ORDERp1); 1187 libint2::simd::VectorAVXDouble t3vec(t3, t3, t3, t3); 1188 libint2::simd::VectorAVXDouble t3 = t3vec * (c30v * u0vec + c31v * u1vec + c32v * u2vec + c33v * u3vec); 1189 libint2::simd::VectorAVXDouble t0123 = horizontal_add(t0, t1, t2, t3); 1190 1191 c40v.load_aligned(c_tuint +4*ORDERp1); c41v.load_aligned((c_tuint+4) +4*ORDERp1); 1192 c42v.load_aligned((c_tuint+8)+4*ORDERp1); c43v.load_aligned((c_tuint+12)+4*ORDERp1); 1193 libint2::simd::VectorAVXDouble t4vec(t4, t4, t4, t4); 1194 libint2::simd::VectorAVXDouble t4 = t4vec * (c40v * u0vec + c41v * u1vec + c42v * u2vec + c43v * u3vec); 1195 c50v.load_aligned(c_tuint +5*ORDERp1); c51v.load_aligned((c_tuint+4) +5*ORDERp1); 1196 c52v.load_aligned((c_tuint+8)+5*ORDERp1); c53v.load_aligned((c_tuint+12)+5*ORDERp1); 1197 libint2::simd::VectorAVXDouble t5vec(t5, t5, t5, t5); 1198 libint2::simd::VectorAVXDouble t5 = t5vec * (c50v * u0vec + c51v * u1vec + c52v * u2vec + c53v * u3vec); 1199 c60v.load_aligned(c_tuint +6*ORDERp1); c61v.load_aligned((c_tuint+4) +6*ORDERp1); 1200 c62v.load_aligned((c_tuint+8)+6*ORDERp1); c63v.load_aligned((c_tuint+12)+6*ORDERp1); 1201 libint2::simd::VectorAVXDouble t6vec(t6, t6, t6, t6); 1202 libint2::simd::VectorAVXDouble t6 = t6vec * (c60v * u0vec + c61v * u1vec + c62v * u2vec + c63v * u3vec); 1203 c70v.load_aligned(c_tuint +7*ORDERp1); c71v.load_aligned((c_tuint+4) +7*ORDERp1); 1204 c72v.load_aligned((c_tuint+8)+7*ORDERp1); c73v.load_aligned((c_tuint+12)+7*ORDERp1); 1205 libint2::simd::VectorAVXDouble t7vec(t7, t7, t7, t7); 1206 libint2::simd::VectorAVXDouble t7 = t7vec * (c70v * u0vec + c71v * u1vec + c72v * u2vec + c73v * u3vec); 1207 libint2::simd::VectorAVXDouble t4567 = horizontal_add(t4, t5, t6, t7); 1208 1209 c80v.load_aligned(c_tuint +8*ORDERp1); c81v.load_aligned((c_tuint+4) +8*ORDERp1); 1210 c82v.load_aligned((c_tuint+8)+8*ORDERp1); c83v.load_aligned((c_tuint+12)+8*ORDERp1); 1211 libint2::simd::VectorAVXDouble t8vec(t8, t8, t8, t8); 1212 libint2::simd::VectorAVXDouble t8 = t8vec * (c80v * u0vec + c81v * u1vec + c82v * u2vec + c83v * u3vec); 1213 c90v.load_aligned(c_tuint +9*ORDERp1); c91v.load_aligned((c_tuint+4) +9*ORDERp1); 1214 c92v.load_aligned((c_tuint+8)+9*ORDERp1); c93v.load_aligned((c_tuint+12)+9*ORDERp1); 1215 libint2::simd::VectorAVXDouble t9vec(t9, t9, t9, t9); 1216 libint2::simd::VectorAVXDouble t9 = t9vec * (c90v * u0vec + c91v * u1vec + c92v * u2vec + c93v * u3vec); 1217 ca0v.load_aligned(c_tuint +10*ORDERp1); ca1v.load_aligned((c_tuint+4) +10*ORDERp1); 1218 ca2v.load_aligned((c_tuint+8)+10*ORDERp1); ca3v.load_aligned((c_tuint+12)+10*ORDERp1); 1219 libint2::simd::VectorAVXDouble tavec(t10, t10, t10, t10); 1220 libint2::simd::VectorAVXDouble ta = tavec * (ca0v * u0vec + ca1v * u1vec + ca2v * u2vec + ca3v * u3vec); 1221 cb0v.load_aligned(c_tuint +11*ORDERp1); cb1v.load_aligned((c_tuint+4) +11*ORDERp1); 1222 cb2v.load_aligned((c_tuint+8)+11*ORDERp1); cb3v.load_aligned((c_tuint+12)+11*ORDERp1); 1223 libint2::simd::VectorAVXDouble tbvec(t11, t11, t11, t11); 1224 libint2::simd::VectorAVXDouble tb = tbvec * (cb0v * u0vec + cb1v * u1vec + cb2v * u2vec + cb3v * u3vec); 1225 libint2::simd::VectorAVXDouble t89ab = horizontal_add(t8, t9, ta, tb); 1226 1227 cc0v.load_aligned(c_tuint +12*ORDERp1); cc1v.load_aligned((c_tuint+4) +12*ORDERp1); 1228 cc2v.load_aligned((c_tuint+8)+12*ORDERp1); cc3v.load_aligned((c_tuint+12)+12*ORDERp1); 1229 libint2::simd::VectorAVXDouble tcvec(t12, t12, t12, t12); 1230 libint2::simd::VectorAVXDouble tc = tcvec * (cc0v * u0vec + cc1v * u1vec + cc2v * u2vec + cc3v * u3vec); 1231 cd0v.load_aligned(c_tuint +13*ORDERp1); cd1v.load_aligned((c_tuint+4) +13*ORDERp1); 1232 cd2v.load_aligned((c_tuint+8)+13*ORDERp1); cd3v.load_aligned((c_tuint+12)+13*ORDERp1); 1233 libint2::simd::VectorAVXDouble tdvec(t13, t13, t13, t13); 1234 libint2::simd::VectorAVXDouble td = tdvec * (cd0v * u0vec + cd1v * u1vec + cd2v * u2vec + cd3v * u3vec); 1235 ce0v.load_aligned(c_tuint +14*ORDERp1); ce1v.load_aligned((c_tuint+4) +14*ORDERp1); 1236 ce2v.load_aligned((c_tuint+8)+14*ORDERp1); ce3v.load_aligned((c_tuint+12)+14*ORDERp1); 1237 libint2::simd::VectorAVXDouble tevec(t14, t14, t14, t14); 1238 libint2::simd::VectorAVXDouble te = tevec * (ce0v * u0vec + ce1v * u1vec + ce2v * u2vec + ce3v * u3vec); 1239 cf0v.load_aligned(c_tuint +15*ORDERp1); cf1v.load_aligned((c_tuint+4)+15*ORDERp1); 1240 cf2v.load_aligned((c_tuint+8)+15*ORDERp1); cf3v.load_aligned((c_tuint+4)+15*ORDERp1); 1241 libint2::simd::VectorAVXDouble tfvec(t15, t15, t15, t15); 1242 libint2::simd::VectorAVXDouble tf = tfvec * (cf0v * u0vec + cf1v * u1vec + cf2v * u2vec + cf3v * u3vec); 1243 libint2::simd::VectorAVXDouble tcdef = horizontal_add(tc, td, te, tf); 1244 1245 auto tall = horizontal_add(t0123, t4567, t89ab, tcdef); 1246 const auto Gm = horizontal_add(tall); 1247 #else // AVX 1248 // no AVX, use explicit loops (probably slow) 1249 double uvec[16]; 1250 double tvec[16]; 1251 uvec[0] = 1; 1252 tvec[0] = 1; 1253 for(int i=1; i!=16; ++i) { 1254 uvec[i] = uvec[i-1] * u; 1255 tvec[i] = tvec[i-1] * t; 1256 } 1257 double Gm = 0.0; 1258 for(int i=0, ij=0; i!=16; ++i) { 1259 for (int j = 0; j != 16; ++j, ++ij) { 1260 Gm += c_tuint[ij] * tvec[i] * uvec[j]; 1261 } 1262 } 1263 #endif // AVX 1264 1265 if (!exp) { 1266 Gm_vec[m] = Gm; 1267 } 1268 else { 1269 if (m != -1) { 1270 Gm_vec[m] = (Gmm1 - Gm) * zeta_over_two_rho; 1271 } 1272 Gmm1 = Gm; 1273 } 1274 } 1275 1276 return; 1277 } 1278 1279 private: 1280 unsigned int mmax_; 1281 Real precision_; 1282 ExpensiveNumbers<Real> numbers_; 1283 Real* c_ = nullptr; 1284 init_tableTennoGmEval1285 void init_table() { 1286 1287 // get memory 1288 void* result; 1289 int status = posix_memalign(&result, std::max(sizeof(Real), 32ul), (mmax_ - mmin_ + 1) * cheb_table_nintervals * ORDERp1 * ORDERp1 * sizeof(Real)); 1290 if (status != 0) { 1291 if (status == EINVAL) 1292 throw std::logic_error( 1293 "TennoGmEval::init() : posix_memalign failed, alignment must be a power of 2 at least as large as sizeof(void *)"); 1294 if (status == ENOMEM) 1295 throw std::bad_alloc(); 1296 abort(); // should be unreachable 1297 } 1298 c_ = static_cast<Real*>(result); 1299 1300 // copy contents of static table into c 1301 // need all intervals 1302 for (std::size_t iv = 0; iv < cheb_table_nintervals; ++iv) { 1303 // but only values of m up to mmax 1304 std::copy(cheb_table[iv], cheb_table[iv]+((mmax_-mmin_)+1)*ORDERp1*ORDERp1, c_+(iv*((mmax_-mmin_)+1))*ORDERp1*ORDERp1); 1305 } 1306 } 1307 1308 1309 }; // TennoGmEval 1310 1311 #if LIBINT2_CONSTEXPR_STATICS 1312 template <typename Real> 1313 constexpr 1314 double TennoGmEval<Real>::cheb_table[TennoGmEval<Real>::cheb_table_nintervals][(TennoGmEval<Real>::cheb_table_mmax+2)*(TennoGmEval<Real>::interpolation_order+1)*(TennoGmEval<Real>::interpolation_order+1)]; 1315 #else 1316 // clang needs an explicit specifalization declaration to avoid warning 1317 # ifdef __clang__ 1318 template <typename Real> 1319 double TennoGmEval<Real>::cheb_table[TennoGmEval<Real>::cheb_table_nintervals][(TennoGmEval<Real>::cheb_table_mmax+2)*(TennoGmEval<Real>::interpolation_order+1)*(TennoGmEval<Real>::interpolation_order+1)]; 1320 # endif 1321 #endif 1322 1323 template<typename Real, int k> 1324 struct GaussianGmEval; 1325 1326 namespace detail { 1327 1328 /// some evaluators need thread-local scratch, but most don't 1329 template <typename CoreEval> struct CoreEvalScratch { 1330 CoreEvalScratch(const CoreEvalScratch&) = default; 1331 CoreEvalScratch(CoreEvalScratch&&) = default; CoreEvalScratchCoreEvalScratch1332 explicit CoreEvalScratch(int) { } 1333 }; 1334 /// GaussianGmEval<Real,-1> needs extra scratch data 1335 template <typename Real> 1336 struct CoreEvalScratch<GaussianGmEval<Real, -1>> { 1337 std::vector<Real> Fm_; 1338 std::vector<Real> g_i; 1339 std::vector<Real> r_i; 1340 std::vector<Real> oorhog_i; 1341 CoreEvalScratch(const CoreEvalScratch&) = default; 1342 CoreEvalScratch(CoreEvalScratch&&) = default; 1343 explicit CoreEvalScratch(int mmax) { 1344 init(mmax); 1345 } 1346 private: 1347 void init(int mmax) { 1348 Fm_.resize(mmax+1); 1349 g_i.resize(mmax+1); 1350 r_i.resize(mmax+1); 1351 oorhog_i.resize(mmax+1); 1352 g_i[0] = 1.0; 1353 r_i[0] = 1.0; 1354 } 1355 }; 1356 } // namespace libint2::detail 1357 1358 ////////////////////////////////////////////////////////// 1359 /// core integrals r12^k \sum_i \exp(- a_i r_12^2) 1360 ////////////////////////////////////////////////////////// 1361 1362 /** 1363 * Evaluates core integral \$ G_m(\rho, T) = \left( - \frac{\partial}{\partial T} \right)^n G_0(\rho,T) \f$, 1364 * \f$ G_0(\rho,T) = \int \exp(-\rho |\vec{r} - \vec{P} + \vec{Q}|^2) g(r) \, {\rm d}\vec{r} \f$ 1365 * over a general contracted 1366 * Gaussian geminal \f$ g(r_{12}) = r_{12}^k \sum_i c_i \exp(- a_i r_{12}^2), \quad k = -1, 0, 2 \f$ . 1367 * The integrals are needed in R12/F12 methods with STG-nG correlation factors. 1368 * Specifically, for a correlation factor \f$ f(r_{12}) = \sum_i c_i \exp(- a_i r_{12}^2) \f$ 1369 * integrals with the following kernels are needed: 1370 * <ul> 1371 * <li> \f$ f(r_{12}) \f$ (k=0) </li> 1372 * <li> \f$ f(r_{12}) / r_{12} \f$ (k=-1) </li> 1373 * <li> \f$ f(r_{12})^2 \f$ (k=0, @sa GaussianGmEval::eval ) </li> 1374 * <li> \f$ [f(r_{12}), [\hat{T}_1, f(r_{12})]] \f$ (k=2, @sa GaussianGmEval::eval ) </li> 1375 * </ul> 1376 * 1377 * N.B. ``Asymmetric'' kernels, \f$ f(r_{12}) g(r_{12}) \f$ and 1378 * \f$ [f(r_{12}), [\hat{T}_1, g(r_{12})]] \f$, where f and g are two different geminals, 1379 * can also be handled straightforwardly. 1380 * 1381 * \note for more details see DOI: 10.1039/b605188j 1382 */ 1383 template<typename Real, int k> 1384 struct GaussianGmEval : private detail::CoreEvalScratch<GaussianGmEval<Real,k>> // N.B. empty-base optimization 1385 { 1386 #ifndef LIBINT_USER_DEFINED_REAL 1387 using FmEvalType = libint2::FmEval_Chebyshev7<double>; 1388 #else 1389 using FmEvalType = libint2::FmEval_Reference<scalar_type>; 1390 #endif 1391 1392 /** 1393 * @param[in] mmax the evaluator will be used to compute Gm(T) for 0 <= m <= mmax 1394 */ 1395 GaussianGmEval(int mmax, Real precision) : 1396 detail::CoreEvalScratch<GaussianGmEval<Real, k>>(mmax), mmax_(mmax), 1397 precision_(precision), fm_eval_(), 1398 numbers_(-1,-1,mmax) { 1399 assert(k == -1 || k == 0 || k == 2); 1400 // for k=-1 need to evaluate the Boys function 1401 if (k == -1) { 1402 fm_eval_ = FmEvalType::instance(mmax_, precision_); 1403 } 1404 } 1405 1406 ~GaussianGmEval() { 1407 } 1408 1409 /// Singleton interface allows to manage the lone instance; 1410 /// adjusts max m and precision values as needed in thread-safe fashion 1411 static std::shared_ptr<GaussianGmEval> instance(unsigned int mmax, Real precision = std::numeric_limits<Real>::epsilon()) { 1412 assert(mmax >= 0); 1413 assert(precision >= 0); 1414 // thread-safe per C++11 standard [6.7.4] 1415 static auto instance_ = std::make_shared<GaussianGmEval>(mmax, precision); 1416 1417 while (instance_->max_m() < mmax || instance_->precision() > precision) { 1418 static std::mutex mtx; 1419 std::lock_guard<std::mutex> lck(mtx); 1420 if (instance_->max_m() < mmax || instance_->precision() > precision) { 1421 auto new_instance = std::make_shared<GaussianGmEval>(mmax, precision); 1422 instance_ = new_instance; 1423 } 1424 } 1425 1426 return instance_; 1427 } 1428 1429 /// @return the maximum value of m for which the \f$ G_m(\rho, T) \f$ can be computed with this object 1430 int max_m() const { return mmax_; } 1431 /// @return the precision with which this object can compute the Boys function 1432 Real precision() const { return precision_; } 1433 1434 /** computes \f$ G_m(\rho, T) \f$ using downward recursion. 1435 * 1436 * @warning NOT reentrant if \c k == -1 and C++11 is not available 1437 * 1438 * @param[out] Gm array to be filled in with the \f$ Gm(\rho, T) \f$ values, must be at least mmax+1 elements long 1439 * @param[in] rho 1440 * @param[in] T 1441 * @param[in] mmax mmax the maximum value of m for which Boys function will be computed; 1442 * it must be <= the value returned by max_m() (this is not checked) 1443 * @param[in] geminal the Gaussian geminal for which the core integral \f$ Gm(\rho, T) \f$ is computed; a contracted Gaussian geminal is represented as a vector of pairs, each of which specifies the exponent (first) and contraction coefficient (second) of the primitive Gaussian geminal 1444 * @param[in] scr if \c k ==-1 and need this to be reentrant, must provide ptr to 1445 * the per-thread \c libint2::detail::CoreEvalScratch<GaussianGmEval<Real,-1>> object; 1446 * no need to specify \c scr otherwise 1447 */ 1448 template <typename AnyReal> 1449 void eval(Real* Gm, Real rho, Real T, size_t mmax, 1450 const std::vector<std::pair<AnyReal, AnyReal> >& geminal, 1451 void* scr = 0) { 1452 1453 std::fill(Gm, Gm+mmax+1, Real(0)); 1454 1455 const auto sqrt_rho = sqrt(rho); 1456 const auto oo_sqrt_rho = 1/sqrt_rho; 1457 if (k == -1) { 1458 void* _scr = (scr == 0) ? this : scr; 1459 auto& scratch = *(reinterpret_cast<detail::CoreEvalScratch<GaussianGmEval<Real, -1>>*>(_scr)); 1460 for(int i=1; i<=mmax; i++) { 1461 scratch.r_i[i] = scratch.r_i[i-1] * rho; 1462 } 1463 } 1464 1465 typedef typename std::vector<std::pair<AnyReal, AnyReal> >::const_iterator citer; 1466 const citer gend = geminal.end(); 1467 for(citer i=geminal.begin(); i!= gend; ++i) { 1468 1469 const auto gamma = i->first; 1470 const auto gcoef = i->second; 1471 const auto rhog = rho + gamma; 1472 const auto oorhog = 1/rhog; 1473 1474 const auto gorg = gamma * oorhog; 1475 const auto rorg = rho * oorhog; 1476 const auto sqrt_rho_org = sqrt_rho * oorhog; 1477 const auto sqrt_rhog = sqrt(rhog); 1478 const auto sqrt_rorg = sqrt_rho_org * sqrt_rhog; 1479 1480 /// (ss|g12|ss) 1481 constexpr Real const_SQRTPI_2(0.88622692545275801364908374167057259139877472806119); /* sqrt(pi)/2 */ 1482 const auto SS_K0G12_SS = gcoef * oo_sqrt_rho * const_SQRTPI_2 * rorg * sqrt_rorg * exp(-gorg*T); 1483 1484 if (k == -1) { 1485 void* _scr = (scr == 0) ? this : scr; 1486 auto& scratch = *(reinterpret_cast<detail::CoreEvalScratch<GaussianGmEval<Real, -1>>*>(_scr)); 1487 1488 const auto rorgT = rorg * T; 1489 fm_eval_->eval(&scratch.Fm_[0], rorgT, mmax); 1490 1491 #if 1 1492 constexpr Real const_2_SQRTPI(1.12837916709551257389615890312154517); /* 2/sqrt(pi) */ 1493 const auto pfac = const_2_SQRTPI * sqrt_rhog * SS_K0G12_SS; 1494 scratch.oorhog_i[0] = pfac; 1495 for(int i=1; i<=mmax; i++) { 1496 scratch.g_i[i] = scratch.g_i[i-1] * gamma; 1497 scratch.oorhog_i[i] = scratch.oorhog_i[i-1] * oorhog; 1498 } 1499 for(int m=0; m<=mmax; m++) { 1500 Real ssss = 0.0; 1501 Real* bcm = numbers_.bc[m]; 1502 for(int n=0; n<=m; n++) { 1503 ssss += bcm[n] * scratch.r_i[n] * scratch.g_i[m-n] * scratch.Fm_[n]; 1504 } 1505 Gm[m] += ssss * scratch.oorhog_i[m]; 1506 } 1507 #endif 1508 } 1509 1510 if (k == 0) { 1511 auto ss_oper_ss_m = SS_K0G12_SS; 1512 Gm[0] += ss_oper_ss_m; 1513 for(int m=1; m<=mmax; ++m) { 1514 ss_oper_ss_m *= gorg; 1515 Gm[m] += ss_oper_ss_m; 1516 } 1517 } 1518 1519 if (k == 2) { 1520 1521 /// (ss|g12*r12^2|ss) 1522 const auto rorgT = rorg * T; 1523 const auto SS_K2G12_SS_0 = (1.5 + rorgT) * (SS_K0G12_SS * oorhog); 1524 const auto SS_K2G12_SS_m1 = rorg * (SS_K0G12_SS * oorhog); 1525 1526 auto SS_K2G12_SS_gorg_m = SS_K2G12_SS_0 ; 1527 auto SS_K2G12_SS_gorg_m1 = SS_K2G12_SS_m1; 1528 Gm[0] += SS_K2G12_SS_gorg_m; 1529 for(int m=1; m<=mmax; ++m) { 1530 SS_K2G12_SS_gorg_m *= gorg; 1531 Gm[m] += SS_K2G12_SS_gorg_m - m * SS_K2G12_SS_gorg_m1; 1532 SS_K2G12_SS_gorg_m1 *= gorg; 1533 } 1534 } 1535 1536 } 1537 1538 } 1539 1540 private: 1541 int mmax_; 1542 Real precision_; //< absolute precision 1543 std::shared_ptr<const FmEvalType> fm_eval_; 1544 1545 ExpensiveNumbers<Real> numbers_; 1546 }; 1547 1548 template <typename GmEvalFunction> 1549 struct GenericGmEval : private GmEvalFunction { 1550 1551 typedef typename GmEvalFunction::value_type Real; 1552 1553 GenericGmEval(int mmax, Real precision) : GmEvalFunction(mmax, precision), 1554 mmax_(mmax), precision_(precision) {} 1555 1556 static std::shared_ptr<const GenericGmEval> instance(int mmax, Real precision = std::numeric_limits<Real>::epsilon()) { 1557 return std::make_shared<const GenericGmEval>(mmax, precision); 1558 } 1559 1560 template <typename Real, typename... ExtraArgs> 1561 void eval(Real* Gm, Real rho, Real T, int mmax, ExtraArgs... args) const { 1562 assert(mmax <= mmax_); 1563 (GmEvalFunction(*this))(Gm, rho, T, mmax, std::forward<ExtraArgs>(args)...); 1564 } 1565 1566 /// @return the maximum value of m for which the \f$ G_m(\rho, T) \f$ can be computed with this object 1567 int max_m() const { return mmax_; } 1568 /// @return the precision with which this object can compute the Boys function 1569 Real precision() const { return precision_; } 1570 1571 private: 1572 int mmax_; 1573 Real precision_; 1574 }; 1575 1576 // these Gm engines need extra scratch data 1577 namespace os_core_ints { 1578 template <typename Real, int K> struct r12_xx_K_gm_eval; 1579 template <typename Real> struct erfc_coulomb_gm_eval; 1580 } 1581 1582 namespace detail { 1583 /// r12_xx_K_gm_eval<1> needs extra scratch data 1584 template <typename Real> 1585 struct CoreEvalScratch<os_core_ints::r12_xx_K_gm_eval<Real, 1>> { 1586 std::vector<Real> Fm_; 1587 CoreEvalScratch(const CoreEvalScratch&) = default; 1588 CoreEvalScratch(CoreEvalScratch&&) = default; 1589 // need to store Fm(T) for m = 0 .. mmax+1 1590 explicit CoreEvalScratch(int mmax) { Fm_.resize(mmax+2); } 1591 }; 1592 /// erfc_coulomb_gm_eval needs extra scratch data 1593 template <typename Real> 1594 struct CoreEvalScratch<os_core_ints::erfc_coulomb_gm_eval<Real>> { 1595 std::vector<Real> Fm_; 1596 CoreEvalScratch(const CoreEvalScratch&) = default; 1597 CoreEvalScratch(CoreEvalScratch&&) = default; 1598 // need to store Fm(T) for m = 0 .. mmax 1599 explicit CoreEvalScratch(int mmax) { Fm_.resize(mmax+1); } 1600 }; 1601 } 1602 1603 /// Obara-Saika core ints code 1604 namespace os_core_ints { 1605 1606 /// core integral evaluator delta function kernels 1607 template <typename Real> 1608 struct delta_gm_eval { 1609 typedef Real value_type; 1610 1611 delta_gm_eval(unsigned int, Real) {} 1612 void operator()(Real* Gm, Real rho, Real T, int mmax) const { 1613 constexpr static auto one_over_two_pi = 1.0 / (2.0 * M_PI); 1614 const auto G0 = exp(-T) * rho * one_over_two_pi; 1615 std::fill(Gm, Gm + mmax + 1, G0); 1616 } 1617 }; 1618 1619 /// core integral evaluator for \f$ r_{12}^K \f$ kernel 1620 /// @tparam K currently supported \c K=1 (use Boys engine directly for \c K=-1) 1621 /// @note need extra scratch for Boys function values when \c K==1, 1622 /// the Gm vector is not long enough for scratch 1623 1624 template <typename Real, int K> 1625 struct r12_xx_K_gm_eval; 1626 1627 template <typename Real> 1628 struct r12_xx_K_gm_eval<Real, 1> 1629 : private detail::CoreEvalScratch<r12_xx_K_gm_eval<Real, 1>> { 1630 typedef detail::CoreEvalScratch<r12_xx_K_gm_eval<Real, 1>> base_type; 1631 typedef Real value_type; 1632 1633 #ifndef LIBINT_USER_DEFINED_REAL 1634 using FmEvalType = libint2::FmEval_Chebyshev7<double>; 1635 #else 1636 using FmEvalType = libint2::FmEval_Reference<scalar_type>; 1637 #endif 1638 1639 r12_xx_K_gm_eval(unsigned int mmax, Real precision) 1640 : base_type(mmax) { 1641 fm_eval_ = FmEvalType::instance(mmax + 1, precision); 1642 } 1643 void operator()(Real* Gm, Real rho, Real T, int mmax) { 1644 fm_eval_->eval(&base_type::Fm_[0], T, mmax + 1); 1645 auto T_plus_m_plus_one = T + 1.0; 1646 Gm[0] = T_plus_m_plus_one * base_type::Fm_[0] - T * base_type::Fm_[1]; 1647 auto minus_m = -1.0; 1648 T_plus_m_plus_one += 1.0; 1649 for (auto m = 1; m <= mmax; 1650 ++m, minus_m -= 1.0, T_plus_m_plus_one += 1.0) { 1651 Gm[m] = 1652 minus_m * base_type::Fm_[m - 1] + T_plus_m_plus_one * base_type::Fm_[m] - T * base_type::Fm_[m + 1]; 1653 } 1654 } 1655 1656 private: 1657 std::shared_ptr<const FmEvalType> fm_eval_; // need for odd K 1658 }; 1659 1660 /// core integral evaluator for \f$ \mathrm{erf}(\omega r) / r \f$ kernel 1661 template <typename Real> 1662 struct erf_coulomb_gm_eval { 1663 typedef Real value_type; 1664 1665 #ifndef LIBINT_USER_DEFINED_REAL 1666 using FmEvalType = libint2::FmEval_Chebyshev7<double>; 1667 #else 1668 using FmEvalType = libint2::FmEval_Reference<scalar_type>; 1669 #endif 1670 1671 erf_coulomb_gm_eval(unsigned int mmax, Real precision) { 1672 fm_eval_ = FmEvalType::instance(mmax, precision); 1673 } 1674 void operator()(Real* Gm, Real rho, Real T, int mmax, Real omega) const { 1675 if (omega > 0) { 1676 auto omega2 = omega * omega; 1677 auto omega2_over_omega2_plus_rho = omega2 / (omega2 + rho); 1678 fm_eval_->eval(Gm, T * omega2_over_omega2_plus_rho, 1679 mmax); 1680 1681 using std::sqrt; 1682 auto ooversqrto2prho_exp_2mplus1 = 1683 sqrt(omega2_over_omega2_plus_rho); 1684 for (auto m = 0; m <= mmax; 1685 ++m, ooversqrto2prho_exp_2mplus1 *= omega2_over_omega2_plus_rho) { 1686 Gm[m] *= ooversqrto2prho_exp_2mplus1; 1687 } 1688 } 1689 else { 1690 std::fill(Gm, Gm+mmax+1, Real{0}); 1691 } 1692 } 1693 1694 private: 1695 std::shared_ptr<const FmEvalType> fm_eval_; // need for odd K 1696 }; 1697 1698 /// core integral evaluator for \f$ \mathrm{erfc}(\omega r) / r \f$ kernel 1699 /// @note need extra scratch for Boys function values, 1700 /// since need to call Boys engine twice 1701 template <typename Real> 1702 struct erfc_coulomb_gm_eval : private 1703 detail::CoreEvalScratch<erfc_coulomb_gm_eval<Real>> { 1704 typedef detail::CoreEvalScratch<erfc_coulomb_gm_eval<Real>> base_type; 1705 typedef Real value_type; 1706 1707 #ifndef LIBINT_USER_DEFINED_REAL 1708 using FmEvalType = libint2::FmEval_Chebyshev7<double>; 1709 #else 1710 using FmEvalType = libint2::FmEval_Reference<scalar_type>; 1711 #endif 1712 1713 erfc_coulomb_gm_eval(unsigned int mmax, Real precision) 1714 : base_type(mmax) { 1715 fm_eval_ = FmEvalType::instance(mmax, precision); 1716 } 1717 void operator()(Real* Gm, Real rho, Real T, int mmax, Real omega) { 1718 fm_eval_->eval(&base_type::Fm_[0], T, mmax); 1719 std::copy(base_type::Fm_.cbegin(), base_type::Fm_.cbegin() + mmax + 1, Gm); 1720 if (omega > 0) { 1721 auto omega2 = omega * omega; 1722 auto omega2_over_omega2_plus_rho = omega2 / (omega2 + rho); 1723 fm_eval_->eval(&base_type::Fm_[0], T * omega2_over_omega2_plus_rho, 1724 mmax); 1725 1726 using std::sqrt; 1727 auto ooversqrto2prho_exp_2mplus1 = 1728 sqrt(omega2_over_omega2_plus_rho); 1729 for (auto m = 0; m <= mmax; 1730 ++m, ooversqrto2prho_exp_2mplus1 *= omega2_over_omega2_plus_rho) { 1731 Gm[m] -= ooversqrto2prho_exp_2mplus1 * base_type::Fm_[m]; 1732 } 1733 } 1734 } 1735 1736 private: 1737 std::shared_ptr<const FmEvalType> fm_eval_; // need for odd K 1738 }; 1739 1740 } // namespace os_core_ints 1741 1742 /* 1743 * Slater geminal fitting is available only if have LAPACK 1744 */ 1745 #if HAVE_LAPACK 1746 /* 1747 f[x_] := - Exp[-\[Zeta] x] / \[Zeta]; 1748 1749 ff[cc_, aa_, x_] := Sum[cc[[i]]*Exp[-aa[[i]] x^2], {i, 1, n}]; 1750 */ 1751 template <typename Real> 1752 Real 1753 fstg(Real zeta, 1754 Real x) { 1755 return -std::exp(-zeta*x)/zeta; 1756 } 1757 1758 template <typename Real> 1759 Real 1760 fngtg(const std::vector<Real>& cc, 1761 const std::vector<Real>& aa, 1762 Real x) { 1763 Real value = 0.0; 1764 const Real x2 = x * x; 1765 const unsigned int n = cc.size(); 1766 for(unsigned int i=0; i<n; ++i) 1767 value += cc[i] * std::exp(- aa[i] * x2); 1768 return value; 1769 } 1770 1771 // --- weighting functions --- 1772 // L2 error is weighted by ww(x) 1773 // hence error is weighted by sqrt(ww(x)) 1774 template <typename Real> 1775 Real 1776 wwtewklopper(Real x) { 1777 const Real x2 = x * x; 1778 return x2 * std::exp(-2 * x2); 1779 } 1780 template <typename Real> 1781 Real 1782 wwcusp(Real x) { 1783 const Real x2 = x * x; 1784 const Real x6 = x2 * x2 * x2; 1785 return std::exp(-0.005 * x6); 1786 } 1787 // default is Tew-Klopper 1788 template <typename Real> 1789 Real 1790 ww(Real x) { 1791 //return wwtewklopper(x); 1792 return wwcusp(x); 1793 } 1794 1795 template <typename Real> 1796 Real 1797 norm(const std::vector<Real>& vec) { 1798 Real value = 0.0; 1799 const unsigned int n = vec.size(); 1800 for(unsigned int i=0; i<n; ++i) 1801 value += vec[i] * vec[i]; 1802 return value; 1803 } 1804 1805 template <typename Real> 1806 void LinearSolveDamped(const std::vector<Real>& A, 1807 const std::vector<Real>& b, 1808 Real lambda, 1809 std::vector<Real>& x) { 1810 const size_t n = b.size(); 1811 std::vector<Real> Acopy(A); 1812 for(size_t m=0; m<n; ++m) Acopy[m*n + m] *= (1 + lambda); 1813 std::vector<Real> e(b); 1814 1815 //int info = LAPACKE_dgesv( LAPACK_ROW_MAJOR, n, 1, &Acopy[0], n, &ipiv[0], &e[0], n ); 1816 { 1817 std::vector<int> ipiv(n); 1818 int n = b.size(); 1819 int one = 1; 1820 int info; 1821 dgesv_(&n, &one, &Acopy[0], &n, &ipiv[0], &e[0], &n, &info); 1822 assert (info == 0); 1823 } 1824 1825 x = e; 1826 } 1827 1828 /** 1829 * computes a least-squares fit of \f$ -exp(-\zeta r_{12})/\zeta = \sum_{i=1}^n c_i exp(-a_i r_{12}^2) \f$ 1830 * on \f$ r_{12} \in [0, x_{\rm max}] \f$ discretized to npts. 1831 * @param[in] n 1832 * @param[in] zeta 1833 * @param[out] geminal 1834 * @param[in] xmin 1835 * @param[in] xmax 1836 * @param[in] npts 1837 */ 1838 template <typename Real> 1839 void stg_ng_fit(unsigned int n, 1840 Real zeta, 1841 std::vector< std::pair<Real, Real> >& geminal, 1842 Real xmin = 0.0, 1843 Real xmax = 10.0, 1844 unsigned int npts = 1001) { 1845 1846 // initial guess 1847 std::vector<Real> cc(n, 1.0); // coefficients 1848 std::vector<Real> aa(n); // exponents 1849 for(unsigned int i=0; i<n; ++i) 1850 aa[i] = std::pow(3.0, (i + 2 - (n + 1)/2.0)); 1851 1852 // first rescale cc for ff[x] to match the norm of f[x] 1853 Real ffnormfac = 0.0; 1854 using std::sqrt; 1855 for(unsigned int i=0; i<n; ++i) 1856 for(unsigned int j=0; j<n; ++j) 1857 ffnormfac += cc[i] * cc[j]/sqrt(aa[i] + aa[j]); 1858 const Real Nf = sqrt(2.0 * zeta) * zeta; 1859 const Real Nff = sqrt(2.0) / (sqrt(ffnormfac) * 1860 sqrt(sqrt(M_PI))); 1861 for(unsigned int i=0; i<n; ++i) cc[i] *= -Nff/Nf; 1862 1863 Real lambda0 = 1000; // damping factor is initially set to 1000, eventually should end up at 0 1864 const Real nu = 3.0; // increase/decrease the damping factor scale it by this 1865 const Real epsilon = 1e-15; // convergence 1866 const unsigned int maxniter = 200; 1867 1868 // grid points on which we will fit 1869 std::vector<Real> xi(npts); 1870 for(unsigned int i=0; i<npts; ++i) xi[i] = xmin + (xmax - xmin)*i/(npts - 1); 1871 1872 std::vector<Real> err(npts); 1873 1874 const size_t nparams = 2*n; // params = expansion coefficients + gaussian exponents 1875 std::vector<Real> J( npts * nparams ); 1876 std::vector<Real> delta(nparams); 1877 1878 // std::cout << "iteration 0" << std::endl; 1879 // for(unsigned int i=0; i<n; ++i) 1880 // std::cout << cc[i] << " " << aa[i] << std::endl; 1881 1882 Real errnormI; 1883 Real errnormIm1 = 1e3; 1884 bool converged = false; 1885 unsigned int iter = 0; 1886 while (!converged && iter < maxniter) { 1887 // std::cout << "Iteration " << ++iter << ": lambda = " << lambda0/nu << std::endl; 1888 1889 for(unsigned int i=0; i<npts; ++i) { 1890 const Real x = xi[i]; 1891 err[i] = (fstg(zeta, x) - fngtg(cc, aa, x)) * sqrt(ww(x)); 1892 } 1893 errnormI = norm(err)/sqrt((Real)npts); 1894 1895 // std::cout << "|err|=" << errnormI << std::endl; 1896 converged = std::abs((errnormI - errnormIm1)/errnormIm1) <= epsilon; 1897 if (converged) break; 1898 errnormIm1 = errnormI; 1899 1900 for(unsigned int i=0; i<npts; ++i) { 1901 const Real x2 = xi[i] * xi[i]; 1902 const Real sqrt_ww_x = sqrt(ww(xi[i])); 1903 const unsigned int ioffset = i * nparams; 1904 for(unsigned int j=0; j<n; ++j) 1905 J[ioffset+j] = (std::exp(-aa[j] * x2)) * sqrt_ww_x; 1906 const unsigned int ioffsetn = ioffset+n; 1907 for(unsigned int j=0; j<n; ++j) 1908 J[ioffsetn+j] = - sqrt_ww_x * x2 * cc[j] * std::exp(-aa[j] * x2); 1909 } 1910 1911 std::vector<Real> A( nparams * nparams); 1912 for(size_t r=0, rc=0; r<nparams; ++r) { 1913 for(size_t c=0; c<nparams; ++c, ++rc) { 1914 double Arc = 0.0; 1915 for(size_t i=0, ir=r, ic=c; i<npts; ++i, ir+=nparams, ic+=nparams) 1916 Arc += J[ir] * J[ic]; 1917 A[rc] = Arc; 1918 } 1919 } 1920 1921 std::vector<Real> b( nparams ); 1922 for(size_t r=0; r<nparams; ++r) { 1923 Real br = 0.0; 1924 for(size_t i=0, ir=r; i<npts; ++i, ir+=nparams) 1925 br += J[ir] * err[i]; 1926 b[r] = br; 1927 } 1928 1929 // try decreasing damping first 1930 // if not successful try increasing damping until it results in a decrease in the error 1931 lambda0 /= nu; 1932 for(int l=-1; l<1000; ++l) { 1933 1934 LinearSolveDamped(A, b, lambda0, delta ); 1935 1936 std::vector<double> cc_0(cc); for(unsigned int i=0; i<n; ++i) cc_0[i] += delta[i]; 1937 std::vector<double> aa_0(aa); for(unsigned int i=0; i<n; ++i) aa_0[i] += delta[i+n]; 1938 1939 // if any of the exponents are negative the step is too large and need to increase damping 1940 bool step_too_large = false; 1941 for(unsigned int i=0; i<n; ++i) 1942 if (aa_0[i] < 0.0) { 1943 step_too_large = true; 1944 break; 1945 } 1946 if (!step_too_large) { 1947 std::vector<double> err_0(npts); 1948 for(unsigned int i=0; i<npts; ++i) { 1949 const double x = xi[i]; 1950 err_0[i] = (fstg(zeta, x) - fngtg(cc_0, aa_0, x)) * sqrt(ww(x)); 1951 } 1952 const double errnorm_0 = norm(err_0)/sqrt((double)npts); 1953 if (errnorm_0 < errnormI) { 1954 cc = cc_0; 1955 aa = aa_0; 1956 break; 1957 } 1958 else // step lead to increase of the error -- try dampening a bit more 1959 lambda0 *= nu; 1960 } 1961 else // too large of a step 1962 lambda0 *= nu; 1963 } // done adjusting the damping factor 1964 1965 } // end of iterative minimization 1966 1967 // if reached max # of iterations throw if the error is too terrible 1968 assert(not (iter == maxniter && errnormI > 1e-10)); 1969 1970 for(unsigned int i=0; i<n; ++i) 1971 geminal[i] = std::make_pair(aa[i], cc[i]); 1972 } 1973 #endif 1974 1975 } // end of namespace libint2 1976 1977 #endif // C++ only 1978 #endif // header guard 1979