1 ////////////////////////////////////////////////////////////////////// 2 // This file is distributed under the University of Illinois/NCSA Open Source 3 // License. See LICENSE file in top directory for details. 4 // 5 // Copyright (c) 2016 Jeongnim Kim and QMCPACK developers. 6 // 7 // File developed by: 8 // Miguel A. Morales, moralessilva2@llnl.gov 9 // Lawrence Livermore National Laboratory 10 // 11 // File created by: 12 // Miguel A. Morales, moralessilva2@llnl.gov 13 // Lawrence Livermore National Laboratory 14 //////////////////////////////////////////////////////////////////////////////// 15 16 #ifndef QMCPLUSPLUS_AFQMC_HAMILTONIANOPERATIONS_KP3INDEXFACTORIZATION_HPP 17 #define QMCPLUSPLUS_AFQMC_HAMILTONIANOPERATIONS_KP3INDEXFACTORIZATION_HPP 18 19 #include <vector> 20 #include <type_traits> 21 #include <mutex> 22 #include <random> 23 24 #include "Configuration.h" 25 #include "AFQMC/config.h" 26 #include "mpi3/shared_communicator.hpp" 27 #include "mpi3/shm/mutex.hpp" 28 #include "multi/array.hpp" 29 #include "multi/array_ref.hpp" 30 #include "AFQMC/Numerics/ma_operations.hpp" 31 32 #include "AFQMC/Utilities/type_conversion.hpp" 33 #include "AFQMC/Utilities/taskgroup.h" 34 #include "AFQMC/Utilities/myTimer.h" 35 36 namespace qmcplusplus 37 { 38 namespace afqmc 39 { 40 class KP3IndexFactorization 41 { 42 using sp_pointer = SPComplexType*; 43 using const_sp_pointer = SPComplexType const*; 44 45 using IVector = boost::multi::array<int, 1>; 46 using CVector = boost::multi::array<ComplexType, 1>; 47 using SpVector = boost::multi::array<SPComplexType, 1>; 48 using CMatrix = boost::multi::array<ComplexType, 2>; 49 using CMatrix_cref = boost::multi::array_cref<ComplexType, 2>; 50 using CMatrix_ref = boost::multi::array_ref<ComplexType, 2>; 51 using CVector_ref = boost::multi::array_ref<ComplexType, 1>; 52 using SpMatrix_cref = boost::multi::array_cref<SPComplexType, 2>; 53 using SpVector_ref = boost::multi::array_ref<SPComplexType, 1>; 54 using SpMatrix_ref = boost::multi::array_ref<SPComplexType, 2>; 55 using C3Tensor = boost::multi::array<ComplexType, 3>; 56 using SpMatrix = boost::multi::array<SPComplexType, 2>; 57 using Sp3Tensor = boost::multi::array<SPComplexType, 3>; 58 using Sp3Tensor_ref = boost::multi::array_ref<SPComplexType, 3>; 59 using Sp4Tensor_ref = boost::multi::array_ref<SPComplexType, 4>; 60 using shmCVector = boost::multi::array<ComplexType, 1, shared_allocator<ComplexType>>; 61 using shmCMatrix = boost::multi::array<ComplexType, 2, shared_allocator<ComplexType>>; 62 using shmIMatrix = boost::multi::array<int, 2, shared_allocator<int>>; 63 using shmC3Tensor = boost::multi::array<ComplexType, 3, shared_allocator<ComplexType>>; 64 using shmSpVector = boost::multi::array<SPComplexType, 1, shared_allocator<SPComplexType>>; 65 using shmSpMatrix = boost::multi::array<SPComplexType, 2, shared_allocator<SPComplexType>>; 66 using shmSp3Tensor = boost::multi::array<SPComplexType, 3, shared_allocator<SPComplexType>>; 67 using communicator = boost::mpi3::shared_communicator; 68 using shared_mutex = boost::mpi3::shm::mutex; 69 using this_t = KP3IndexFactorization; 70 71 public: 72 static const HamiltonianTypes HamOpType = KPFactorized; getHamType() const73 HamiltonianTypes getHamType() const { return HamOpType; } 74 KP3IndexFactorization(communicator & c_,WALKER_TYPES type,IVector && nopk_,IVector && ncholpQ_,IVector && kminus_,shmIMatrix && nelpk_,shmIMatrix && QKToK2_,shmC3Tensor && hij_,shmCMatrix && h1,std::vector<shmSpMatrix> && vik,std::vector<shmSpMatrix> && vak,std::vector<shmSpMatrix> && vbl,IVector && qqm_,shmC3Tensor && vn0_,std::vector<RealType> && gQ_,int nsampleQ_,ValueType e0_,int cv0,int gncv)75 KP3IndexFactorization(communicator& c_, 76 WALKER_TYPES type, 77 IVector&& nopk_, 78 IVector&& ncholpQ_, 79 IVector&& kminus_, 80 shmIMatrix&& nelpk_, 81 shmIMatrix&& QKToK2_, 82 shmC3Tensor&& hij_, 83 shmCMatrix&& h1, 84 std::vector<shmSpMatrix>&& vik, 85 std::vector<shmSpMatrix>&& vak, 86 std::vector<shmSpMatrix>&& vbl, 87 IVector&& qqm_, 88 shmC3Tensor&& vn0_, 89 std::vector<RealType>&& gQ_, 90 int nsampleQ_, 91 ValueType e0_, 92 int cv0, 93 int gncv) 94 : comm(std::addressof(c_)), 95 walker_type(type), 96 global_origin(cv0), 97 global_nCV(gncv), 98 local_nCV(0), 99 E0(e0_), 100 H1(std::move(hij_)), 101 haj(std::move(h1)), 102 nopk(std::move(nopk_)), 103 ncholpQ(std::move(ncholpQ_)), 104 kminus(std::move(kminus_)), 105 nelpk(std::move(nelpk_)), 106 QKToK2(std::move(QKToK2_)), 107 LQKikn(std::move(vik)), 108 LQKank(std::move(vak)), 109 LQKbnl(std::move(vbl)), 110 Qmap(std::move(qqm_)), 111 Q2vbias(Qmap.size()), 112 number_of_symmetric_Q(0), 113 vn0(std::move(vn0_)), 114 nsampleQ(nsampleQ_), 115 gQ(std::move(gQ_)), 116 Qwn({1, 1}, shared_allocator<SPComplexType>{*comm}), 117 generator(), 118 distribution(gQ.begin(), gQ.end()), 119 SM_TMats({1, 1}, shared_allocator<SPComplexType>{*comm}), 120 TMats({1, 1}), 121 mutex(0), 122 EQ(nopk.size() + 2) 123 { 124 mutex.reserve(ncholpQ.size()); 125 for (int nQ = 0; nQ < ncholpQ.size(); nQ++) 126 mutex.emplace_back(std::make_unique<shared_mutex>(*comm)); 127 std::fill_n(EQ.data(), EQ.size(), 0); 128 int nkpts = nopk.size(); 129 // Defines behavior over Q vector: 130 // <0: Ignore (handled by another TG) 131 // 0: Calculate, without rho^+ contribution 132 // >0: Calculate, with rho^+ contribution. LQKbln data located at Qmap[Q]-1 133 number_of_symmetric_Q = 0; 134 local_nCV = 0; 135 std::fill_n(Q2vbias.origin(), nkpts, 0); 136 for (int Q = 0; Q < nkpts; Q++) 137 { 138 if (Q > kminus[Q]) 139 { 140 if (Qmap[kminus[Q]] == 0) 141 { 142 assert(Qmap[Q] == 0); 143 Q2vbias[Q] = 2 * local_nCV; 144 local_nCV += ncholpQ[Q]; 145 } 146 else 147 { 148 assert(Qmap[kminus[Q]] < 0); 149 assert(Qmap[Q] < 0); 150 } 151 } 152 else if (Qmap[Q] >= 0) 153 { 154 Q2vbias[Q] = 2 * local_nCV; 155 local_nCV += ncholpQ[Q]; 156 if (Qmap[Q] > 0) 157 number_of_symmetric_Q++; 158 } 159 } 160 for (int Q = 0; Q < nkpts; Q++) 161 { 162 if (Qmap[Q] > 0) 163 { 164 assert(Q == kminus[Q]); 165 assert(Qmap[Q] <= number_of_symmetric_Q); 166 } 167 } 168 comm->barrier(); 169 } 170 ~KP3IndexFactorization()171 ~KP3IndexFactorization() {} 172 173 KP3IndexFactorization(const KP3IndexFactorization& other) = delete; 174 KP3IndexFactorization& operator=(const KP3IndexFactorization& other) = delete; 175 KP3IndexFactorization(KP3IndexFactorization&& other) = default; 176 KP3IndexFactorization& operator=(KP3IndexFactorization&& other) = default; 177 getOneBodyPropagatorMatrix(TaskGroup_ & TG,boost::multi::array<ComplexType,1> const & vMF)178 boost::multi::array<ComplexType, 2> getOneBodyPropagatorMatrix(TaskGroup_& TG, 179 boost::multi::array<ComplexType, 1> const& vMF) 180 { 181 int nkpts = nopk.size(); 182 int NMO = std::accumulate(nopk.begin(), nopk.end(), 0); 183 int npol = (walker_type == NONCOLLINEAR) ? 2 : 1; 184 185 // making a copy of vMF since it will be modified 186 shmCVector vMF_(iextensions<1u>{vMF.num_elements()}, shared_allocator<ComplexType>{*comm}); 187 comm->barrier(); 188 //if(comm->root()) 189 { 190 using std::copy_n; 191 copy_n(vMF.origin(), vMF.num_elements(), vMF_.origin()); 192 } 193 comm->barrier(); 194 195 boost::multi::array<ComplexType, 2> P0({NMO, NMO}); 196 boost::multi::array_ref<ComplexType, 1> P0D(to_address(P0.origin()), {P0.num_elements()}); 197 std::fill_n(P0D.origin(), P0D.num_elements(), ComplexType(0)); 198 vHS(vMF_, P0D); 199 if (TG.TG().size() > 1) 200 TG.TG().all_reduce_in_place_n(P0D.origin(), P0D.num_elements(), std::plus<>()); 201 202 boost::multi::array<ComplexType, 2> P1({npol * NMO, npol * NMO}); 203 std::fill_n(P1.origin(), P1.num_elements(), ComplexType(0.0)); 204 205 // add spin-dependent H1 206 for (int K = 0, nk0 = 0; K < nkpts; ++K) 207 { 208 for (int i = 0, I = nk0; i < nopk[K]; i++, I++) 209 { 210 for (int p = 0; p < npol; ++p) 211 P1[p * NMO + I][p * NMO + I] += H1[K][p * nopk[K] + i][p * nopk[K] + i]; 212 for (int j = i + 1, J = I + 1; j < nopk[K]; j++, J++) 213 { 214 for (int p = 0; p < npol; ++p) 215 { 216 P1[p * NMO + I][p * NMO + J] += H1[K][p * nopk[K] + i][p * nopk[K] + j]; 217 P1[p * NMO + J][p * NMO + I] += H1[K][p * nopk[K] + j][p * nopk[K] + i]; 218 } 219 } 220 if (walker_type == NONCOLLINEAR) 221 { 222 // offdiagonal piece 223 for (int j = 0, J = nk0; j < nopk[K]; j++, J++) 224 { 225 P1[I][NMO + J] += H1[K][i][nopk[K] + j]; 226 P1[NMO + J][I] += H1[K][nopk[K] + j][i]; 227 } 228 } 229 } 230 nk0 += nopk[K]; 231 } 232 233 // add P0 (diagonal in spin) 234 for (int p = 0; p < npol; ++p) 235 for (int I = 0; I < NMO; I++) 236 for (int J = 0; J < NMO; J++) 237 P1[p * NMO + I][p * NMO + J] += P0[I][J]; 238 239 // add vn0 (diagonal in spin) 240 for (int K = 0, nk0 = 0; K < nkpts; ++K) 241 { 242 for (int i = 0, I = nk0; i < nopk[K]; i++, I++) 243 { 244 for (int p = 0; p < npol; ++p) 245 P1[p * NMO + I][p * NMO + I] += vn0[K][i][i]; 246 for (int j = i + 1, J = I + 1; j < nopk[K]; j++, J++) 247 { 248 for (int p = 0; p < npol; ++p) 249 { 250 P1[p * NMO + I][p * NMO + J] += vn0[K][i][j]; 251 P1[p * NMO + J][p * NMO + I] += vn0[K][j][i]; 252 } 253 } 254 } 255 nk0 += nopk[K]; 256 } 257 258 using ma::conj; 259 // symmetrize 260 for (int I = 0; I < npol * NMO; I++) 261 { 262 for (int J = I + 1; J < npol * NMO; J++) 263 { 264 // This is really cutoff dependent!!! 265 #if defined(MIXED_PRECISION) 266 if (std::abs(P1[I][J] - ma::conj(P1[J][I])) * 2.0 > 1e-5) 267 { 268 #else 269 if (std::abs(P1[I][J] - ma::conj(P1[J][I])) * 2.0 > 1e-6) 270 { 271 #endif 272 app_error() << " WARNING in getOneBodyPropagatorMatrix. H1 is not hermitian. \n"; 273 app_error() << I << " " << J << " " << P1[I][J] << " " << P1[J][I] << std::endl; 274 //<< H1[K][i][j] << " " 275 //<< H1[K][j][i] << " " << vn0[K][i][j] << " " << vn0[K][j][i] << std::endl; 276 } 277 P1[I][J] = 0.5 * (P1[I][J] + ma::conj(P1[J][I])); 278 P1[J][I] = ma::conj(P1[I][J]); 279 } 280 } 281 return P1; 282 } 283 284 template<class Mat, class MatB> 285 void energy(Mat&& E, MatB const& G, int k, bool addH1 = true, bool addEJ = true, bool addEXX = true) 286 { 287 MatB* Kr(nullptr); 288 MatB* Kl(nullptr); 289 energy(E, G, k, Kl, Kr, addH1, addEJ, addEXX); 290 } 291 292 // KEleft and KEright must be in shared memory for this to work correctly 293 template<class Mat, class MatB, class MatC, class MatD> 294 void energy(Mat&& E, 295 MatB const& Gc, 296 int nd, 297 MatC* KEleft, 298 MatD* KEright, 299 bool addH1 = true, 300 bool addEJ = true, 301 bool addEXX = true) 302 { 303 if (nsampleQ > 0) 304 energy_sampleQ(E, Gc, nd, KEleft, KEright, addH1, addEJ, addEXX); 305 else 306 energy_exact(E, Gc, nd, KEleft, KEright, addH1, addEJ, addEXX); 307 } 308 309 // KEleft and KEright must be in shared memory for this to work correctly 310 template<class Mat, class MatB, class MatC, class MatD> 311 void energy_exact(Mat&& E, 312 MatB const& Gc, 313 int nd, 314 MatC* KEleft, 315 MatD* KEright, 316 bool addH1 = true, 317 bool addEJ = true, 318 bool addEXX = true) 319 { 320 int nkpts = nopk.size(); 321 assert(E.size(1) >= 3); 322 assert(nd >= 0 && nd < nelpk.size()); 323 324 int nwalk = Gc.size(1); 325 int nspin = (walker_type == COLLINEAR ? 2 : 1); 326 int npol = (walker_type == NONCOLLINEAR ? 2 : 1); 327 int nmo_tot = std::accumulate(nopk.begin(), nopk.end(), 0); 328 int nmo_max = *std::max_element(nopk.begin(), nopk.end()); 329 int nocca_tot = std::accumulate(nelpk[nd].begin(), nelpk[nd].begin() + nkpts, 0); 330 int nocca_max = *std::max_element(nelpk[nd].begin(), nelpk[nd].begin() + nkpts); 331 int nchol_max = *std::max_element(ncholpQ.begin(), ncholpQ.end()); 332 int noccb_tot = 0; 333 if (walker_type == COLLINEAR) 334 noccb_tot = std::accumulate(nelpk[nd].begin() + nkpts, nelpk[nd].begin() + 2 * nkpts, 0); 335 int getKr = KEright != nullptr; 336 int getKl = KEleft != nullptr; 337 if (E.size(0) != nwalk || E.size(1) < 3) 338 APP_ABORT( 339 " Error in AFQMC/HamiltonianOperations/KP3IndexFactorization::energy(). Incorrect matrix dimensions \n"); 340 341 size_t mem_needs(nwalk * nkpts * nkpts * nspin * nocca_max * nmo_max * npol); 342 size_t cnt(0); 343 if (addEJ) 344 { 345 #if defined(MIXED_PRECISION) 346 mem_needs += 2 * nwalk * local_nCV; 347 #else 348 if (not getKr) 349 mem_needs += nwalk * local_nCV; 350 if (not getKl) 351 mem_needs += nwalk * local_nCV; 352 #endif 353 } 354 set_shm_buffer(mem_needs); 355 356 // messy 357 SPComplexType *Krptr(nullptr), *Klptr(nullptr); 358 long Knr = 0, Knc = 0; 359 if (addEJ) 360 { 361 Knr = nwalk; 362 Knc = local_nCV; 363 cnt = 0; 364 #if defined(MIXED_PRECISION) 365 if (getKr) 366 { 367 assert(KEright->size(0) == nwalk && KEright->size(1) == local_nCV); 368 assert(KEright->stride(0) == KEright->size(1)); 369 } 370 #else 371 if (getKr) 372 { 373 assert(KEright->size(0) == nwalk && KEright->size(1) == local_nCV); 374 assert(KEright->stride(0) == KEright->size(1)); 375 Krptr = to_address(KEright->origin()); 376 } 377 else 378 #endif 379 { 380 Krptr = to_address(SM_TMats.origin()); 381 cnt += nwalk * local_nCV; 382 } 383 #if defined(MIXED_PRECISION) 384 if (getKl) 385 { 386 assert(KEleft->size(0) == nwalk && KEleft->size(1) == local_nCV); 387 assert(KEleft->stride(0) == KEleft->size(1)); 388 } 389 #else 390 if (getKl) 391 { 392 assert(KEleft->size(0) == nwalk && KEleft->size(1) == local_nCV); 393 assert(KEleft->stride(0) == KEleft->size(1)); 394 Klptr = to_address(KEleft->origin()); 395 } 396 else 397 #endif 398 { 399 Klptr = to_address(SM_TMats.origin()) + cnt; 400 cnt += nwalk * local_nCV; 401 } 402 if (comm->root()) 403 std::fill_n(Krptr, Knr * Knc, SPComplexType(0.0)); 404 if (comm->root()) 405 std::fill_n(Klptr, Knr * Knc, SPComplexType(0.0)); 406 } 407 else if (getKr or getKl) 408 { 409 APP_ABORT(" Error: Kr and/or Kl can only be calculated with addEJ=true.\n"); 410 } 411 SpMatrix_ref Kl(Klptr, {long(Knr), long(Knc)}); 412 SpMatrix_ref Kr(Krptr, {long(Knr), long(Knc)}); 413 414 for (int n = 0; n < nwalk; n++) 415 std::fill_n(E[n].origin(), 3, ComplexType(0.)); 416 417 assert(Gc.num_elements() == nwalk * (nocca_tot + noccb_tot) * npol * nmo_tot); 418 boost::multi::array_cref<ComplexType, 3> G3Da(to_address(Gc.origin()), {nocca_tot * npol, nmo_tot, nwalk}); 419 boost::multi::array_cref<ComplexType, 3> G3Db(to_address(Gc.origin()) + G3Da.num_elements() * (nspin - 1), 420 {noccb_tot, nmo_tot, nwalk}); 421 422 423 // with yet another mapping, it is possible to reduce the memory usage here! 424 // avoiding for now! 425 Sp4Tensor_ref GKK(to_address(SM_TMats.origin()) + cnt, {nspin, nkpts, nkpts, nwalk * npol * nmo_max * nocca_max}); 426 GKaKjw_to_GKKwaj(nd, Gc, GKK, nocca_tot, noccb_tot, nmo_tot, nmo_max * nocca_max); 427 comm->barrier(); 428 429 // one-body contribution 430 // haj[ndet*nkpts][nocc*nmo] 431 if (addH1) 432 { 433 int na = 0, nk = 0, nb = 0; 434 for (int n = 0; n < nwalk; n++) 435 E[n][0] = E0; 436 for (int K = 0; K < nkpts; ++K) 437 { 438 #ifdef MIXED_PRECISION 439 // must use Gc since GKK is is SP 440 boost::multi::array_ref<ComplexType, 3> haj_K(to_address(haj[nd * nkpts + K].origin()), 441 {nelpk[nd][K], npol, nopk[K]}); 442 for (int a = 0; a < nelpk[nd][K]; ++a) 443 for (int pol = 0; pol < npol; ++pol) 444 ma::product(ComplexType(1.), ma::T(G3Da[(na + a) * npol + pol].sliced(nk, nk + nopk[K])), haj_K[a][pol], 445 ComplexType(1.), E(E.extension(0), 0)); 446 na += nelpk[nd][K]; 447 if (walker_type == COLLINEAR) 448 { 449 boost::multi::array_ref<ComplexType, 2> haj_Kb(haj_K.origin() + haj_K.num_elements(), 450 {nelpk[nd][nkpts + K], nopk[K]}); 451 for (int b = 0; b < nelpk[nd][nkpts + K]; ++b) 452 ma::product(ComplexType(1.), ma::T(G3Db[nb + b].sliced(nk, nk + nopk[K])), haj_Kb[b], ComplexType(1.), 453 E(E.extension(0), 0)); 454 nb += nelpk[nd][nkpts + K]; 455 } 456 nk += nopk[K]; 457 #else 458 // use GKK 459 nk = nopk[K]; 460 { 461 na = nelpk[nd][K]; 462 CVector_ref haj_K(to_address(haj[nd * nkpts + K].origin()), {na * npol * nk}); 463 SpMatrix_ref Gaj(to_address(GKK[0][K][K].origin()), {nwalk, na * npol * nk}); 464 ma::product(ComplexType(1.), Gaj, haj_K, ComplexType(1.), E(E.extension(0), 0)); 465 } 466 if (walker_type == COLLINEAR) 467 { 468 na = nelpk[nd][nkpts + K]; 469 CVector_ref haj_K(to_address(haj[nd * nkpts + K].origin()) + nelpk[nd][K] * nk, {na * nk}); 470 SpMatrix_ref Gaj(to_address(GKK[1][K][K].origin()), {nwalk, na * nk}); 471 ma::product(ComplexType(1.), Gaj, haj_K, ComplexType(1.), E(E.extension(0), 0)); 472 } 473 #endif 474 } 475 } 476 477 if (addEXX) 478 { 479 size_t local_memory_needs = 2 * nwalk * nocca_max * nocca_max * nchol_max + 2 * nchol_max * nwalk; 480 if (TMats.num_elements() < local_memory_needs) 481 TMats.reextent({local_memory_needs, 1}); 482 cnt = 0; 483 SpMatrix_ref Kr_local(TMats.origin(), {nwalk, nchol_max}); 484 cnt += Kr_local.num_elements(); 485 SpMatrix_ref Kl_local(TMats.origin() + cnt, {nwalk, nchol_max}); 486 cnt += Kl_local.num_elements(); 487 std::fill_n(Kr_local.origin(), Kr_local.num_elements(), SPComplexType(0.0)); 488 std::fill_n(Kl_local.origin(), Kl_local.num_elements(), SPComplexType(0.0)); 489 SPRealType scl = (walker_type == CLOSED ? 2.0 : 1.0); 490 size_t nqk = 1; 491 for (int Q = 0; Q < nkpts; ++Q) 492 { 493 if (Qmap[Q] < 0) 494 continue; 495 bool haveKE = false; 496 for (int Ka = 0; Ka < nkpts; ++Ka) 497 { 498 int K0 = ((Qmap[Q] > 0) ? 0 : Ka); 499 for (int Kb = K0; Kb < nkpts; ++Kb) 500 { 501 if ((nqk++) % comm->size() == comm->rank()) 502 { 503 int nchol = ncholpQ[Q]; 504 int Qm = kminus[Q]; 505 int Kl = QKToK2[Qm][Kb]; 506 int Kk = QKToK2[Q][Ka]; 507 int nl = nopk[Kl]; 508 int nb = nelpk[nd][Kb]; 509 int na = nelpk[nd][Ka]; 510 int nk = nopk[Kk]; 511 512 SpMatrix_ref Gwal(GKK[0][Ka][Kl].origin(), {nwalk * na, npol * nl}); 513 SpMatrix_ref Gwbk(GKK[0][Kb][Kk].origin(), {nwalk * nb, npol * nk}); 514 SpMatrix_ref Lank(to_address(LQKank[nd * nspin * nkpts + Q][Ka].origin()), {na * nchol, npol * nk}); 515 auto bnl_ptr(to_address(LQKank[nd * nspin * nkpts + Qm][Kb].origin())); 516 if (Qmap[Q] > 0) 517 bnl_ptr = to_address(LQKbnl[nd * nspin * number_of_symmetric_Q + Qmap[Q] - 1][Kb].origin()); 518 SpMatrix_ref Lbnl(bnl_ptr, {nb * nchol, npol * nl}); 519 520 SpMatrix_ref Twban(TMats.origin() + cnt, {nwalk * nb, na * nchol}); 521 Sp4Tensor_ref T4Dwban(TMats.origin() + cnt, {nwalk, nb, na, nchol}); 522 SpMatrix_ref Twabn(Twban.origin() + Twban.num_elements(), {nwalk * na, nb * nchol}); 523 Sp4Tensor_ref T4Dwabn(Twban.origin() + Twban.num_elements(), {nwalk, na, nb, nchol}); 524 525 if (na > 0 && nb > 0) 526 ma::product(Gwal, ma::T(Lbnl), Twabn); 527 if (na > 0 && nb > 0) 528 ma::product(Gwbk, ma::T(Lank), Twban); 529 530 for (int n = 0; n < nwalk; ++n) 531 { 532 SPComplexType E_(0.0); 533 for (int a = 0; a < na; ++a) 534 for (int b = 0; b < nb; ++b) 535 E_ += ma::dot(T4Dwabn[n][a][b], T4Dwban[n][b][a]); 536 if (Qmap[Q] > 0 || Ka == Kb) 537 E[n][1] -= 0.5 * static_cast<ComplexType>(scl * E_); 538 else 539 E[n][1] -= static_cast<ComplexType>(scl * E_); 540 } 541 542 if (addEJ && Ka == Kb) 543 { 544 haveKE = true; 545 for (int n = 0; n < nwalk; ++n) 546 for (int a = 0; a < na; ++a) 547 { 548 ma::axpy(SPComplexType(1.0), T4Dwban[n][a][a], Kl_local[n].sliced(0, nchol)); 549 ma::axpy(SPComplexType(1.0), T4Dwabn[n][a][a], Kr_local[n].sliced(0, nchol)); 550 } 551 } 552 553 } // if 554 555 if (walker_type == COLLINEAR) 556 { 557 if ((nqk++) % comm->size() == comm->rank()) 558 { 559 int nchol = ncholpQ[Q]; 560 int Qm = kminus[Q]; 561 int Kl = QKToK2[Qm][Kb]; 562 int Kk = QKToK2[Q][Ka]; 563 int nl = nopk[Kl]; 564 int nb = nelpk[nd][nkpts + Kb]; 565 int na = nelpk[nd][nkpts + Ka]; 566 int nk = nopk[Kk]; 567 568 SpMatrix_ref Gwal(GKK[1][Ka][Kl].origin(), {nwalk * na, nl}); 569 SpMatrix_ref Gwbk(GKK[1][Kb][Kk].origin(), {nwalk * nb, nk}); 570 SpMatrix_ref Lank(to_address(LQKank[(nd * nspin + 1) * nkpts + Q][Ka].origin()), {na * nchol, nk}); 571 auto bnl_ptr(to_address(LQKank[(nd * nspin + 1) * nkpts + Qm][Kb].origin())); 572 if (Qmap[Q] > 0) 573 bnl_ptr = to_address(LQKbnl[(nd * nspin + 1) * number_of_symmetric_Q + Qmap[Q] - 1][Kb].origin()); 574 SpMatrix_ref Lbnl(bnl_ptr, {nb * nchol, nl}); 575 576 SpMatrix_ref Twban(TMats.origin() + cnt, {nwalk * nb, na * nchol}); 577 Sp4Tensor_ref T4Dwban(TMats.origin() + cnt, {nwalk, nb, na, nchol}); 578 SpMatrix_ref Twabn(Twban.origin() + Twban.num_elements(), {nwalk * na, nb * nchol}); 579 Sp4Tensor_ref T4Dwabn(Twban.origin() + Twban.num_elements(), {nwalk, na, nb, nchol}); 580 581 if (na > 0 && nb > 0) 582 ma::product(Gwal, ma::T(Lbnl), Twabn); 583 if (na > 0 && nb > 0) 584 ma::product(Gwbk, ma::T(Lank), Twban); 585 586 for (int n = 0; n < nwalk; ++n) 587 { 588 SPComplexType E_(0.0); 589 for (int a = 0; a < na; ++a) 590 for (int b = 0; b < nb; ++b) 591 E_ += ma::dot(T4Dwabn[n][a][b], T4Dwban[n][b][a]); 592 if (Qmap[Q] > 0 || Ka == Kb) 593 E[n][1] -= 0.5 * static_cast<ComplexType>(scl * E_); 594 else 595 E[n][1] -= static_cast<ComplexType>(scl * E_); 596 } 597 598 if (addEJ && Ka == Kb) 599 { 600 haveKE = true; 601 for (int n = 0; n < nwalk; ++n) 602 for (int a = 0; a < na; ++a) 603 { 604 ma::axpy(SPComplexType(1.0), T4Dwban[n][a][a], Kl_local[n].sliced(0, nchol)); 605 ma::axpy(SPComplexType(1.0), T4Dwabn[n][a][a], Kr_local[n].sliced(0, nchol)); 606 } 607 } 608 609 } // if 610 } // COLLINEAR 611 } // Kb 612 } // Ka 613 if (addEJ && haveKE) 614 { 615 std::lock_guard<shared_mutex> guard(*mutex[Q]); 616 int nc0 = Q2vbias[Q] / 2; //std::accumulate(ncholpQ.begin(),ncholpQ.begin()+Q,0); 617 for (int n = 0; n < nwalk; n++) 618 { 619 ma::axpy(SPComplexType(1.0), Kr_local[n].sliced(0, ncholpQ[Q]), Kr[n].sliced(nc0, nc0 + ncholpQ[Q])); 620 ma::axpy(SPComplexType(1.0), Kl_local[n].sliced(0, ncholpQ[Q]), Kl[n].sliced(nc0, nc0 + ncholpQ[Q])); 621 } 622 } // to release the lock 623 if (addEJ && haveKE) 624 { 625 std::fill_n(Kr_local.origin(), Kr_local.num_elements(), SPComplexType(0.0)); 626 std::fill_n(Kl_local.origin(), Kl_local.num_elements(), SPComplexType(0.0)); 627 } 628 } // Q 629 } 630 631 if (addEJ) 632 { 633 if (not addEXX) 634 { 635 // calculate Kr 636 APP_ABORT(" Error: Finish addEJ and not addEXX"); 637 } 638 comm->barrier(); 639 size_t nqk = 0; 640 SPRealType scl = (walker_type == CLOSED ? 2.0 : 1.0); 641 for (int n = 0; n < nwalk; ++n) 642 { 643 for (int Q = 0; Q < nkpts; ++Q) 644 { // momentum conservation index 645 if (Qmap[Q] < 0) 646 continue; 647 if ((nqk++) % comm->size() == comm->rank()) 648 { 649 int nc0 = Q2vbias[Q] / 2; //std::accumulate(ncholpQ.begin(),ncholpQ.begin()+Q,0); 650 E[n][2] += 0.5 * 651 static_cast<ComplexType>(scl * scl * 652 ma::dot(Kl[n]({nc0, nc0 + ncholpQ[Q]}), Kr[n]({nc0, nc0 + ncholpQ[Q]}))); 653 } 654 } 655 } 656 #if defined(MIXED_PRECISION) 657 if (getKl) 658 { 659 size_t i0, iN; 660 std::tie(i0, iN) = 661 FairDivideBoundary(size_t(comm->rank()), size_t(KEleft->num_elements()), size_t(comm->size())); 662 copy_n_cast(Klptr + i0, iN - i0, to_address(KEleft->origin()) + i0); 663 } 664 if (getKr) 665 { 666 size_t i0, iN; 667 std::tie(i0, iN) = 668 FairDivideBoundary(size_t(comm->rank()), size_t(KEright->num_elements()), size_t(comm->size())); 669 copy_n_cast(Krptr + i0, iN - i0, to_address(KEright->origin()) + i0); 670 } 671 #endif 672 comm->barrier(); 673 } 674 } 675 676 // KEleft and KEright must be in shared memory for this to work correctly 677 template<class Mat, class MatB, class MatC, class MatD> 678 void energy_sampleQ(Mat&& E, 679 MatB const& Gc, 680 int nd, 681 MatC* KEleft, 682 MatD* KEright, 683 bool addH1 = true, 684 bool addEJ = true, 685 bool addEXX = true) 686 { 687 APP_ABORT(" Error: Incomplete implementation. \n"); 688 // need to finish modifications for distribution of Q 689 690 int nkpts = nopk.size(); 691 assert(E.size(1) >= 3); 692 assert(nd >= 0 && nd < nelpk.size()); 693 694 int nwalk = Gc.size(1); 695 int nspin = (walker_type == COLLINEAR ? 2 : 1); 696 int nmo_tot = std::accumulate(nopk.begin(), nopk.end(), 0); 697 int nmo_max = *std::max_element(nopk.begin(), nopk.end()); 698 int nocca_tot = std::accumulate(nelpk[nd].begin(), nelpk[nd].begin() + nkpts, 0); 699 int nocca_max = *std::max_element(nelpk[nd].begin(), nelpk[nd].begin() + nkpts); 700 int nchol_max = *std::max_element(ncholpQ.begin(), ncholpQ.end()); 701 int noccb_tot = 0; 702 if (walker_type == COLLINEAR) 703 noccb_tot = std::accumulate(nelpk[nd].begin() + nkpts, nelpk[nd].begin() + 2 * nkpts, 0); 704 int getKr = KEright != nullptr; 705 int getKl = KEleft != nullptr; 706 if (E.size(0) != nwalk || E.size(1) < 3) 707 APP_ABORT(" Error in AFQMC/HamiltonianOperations/KP3IndexFactorization::energy(). Incorrect matrix dimensions\n"); 708 709 size_t mem_needs(nwalk * nkpts * nkpts * nspin * nocca_max * nmo_max); 710 size_t cnt(0); 711 if (addEJ) 712 { 713 #if defined(MIXED_PRECISION) 714 mem_needs += 2 * nwalk * local_nCV; 715 #else 716 if (not getKr) 717 mem_needs += nwalk * local_nCV; 718 if (not getKl) 719 mem_needs += nwalk * local_nCV; 720 #endif 721 } 722 set_shm_buffer(mem_needs); 723 724 // messy 725 SPComplexType *Krptr(nullptr), *Klptr(nullptr); 726 long Knr = 0, Knc = 0; 727 if (addEJ) 728 { 729 Knr = nwalk; 730 Knc = local_nCV; 731 cnt = 0; 732 #if defined(MIXED_PRECISION) 733 if (getKr) 734 { 735 assert(KEright->size(0) == nwalk && KEright->size(1) == local_nCV); 736 assert(KEright->stride(0) == KEright->size(1)); 737 } 738 #else 739 if (getKr) 740 { 741 assert(KEright->size(0) == nwalk && KEright->size(1) == local_nCV); 742 assert(KEright->stride(0) == KEright->size(1)); 743 Krptr = to_address(KEright->origin()); 744 } 745 else 746 #endif 747 { 748 Krptr = to_address(SM_TMats.origin()); 749 cnt += nwalk * local_nCV; 750 } 751 #if defined(MIXED_PRECISION) 752 if (getKl) 753 { 754 assert(KEleft->size(0) == nwalk && KEleft->size(1) == local_nCV); 755 assert(KEleft->stride(0) == KEleft->size(1)); 756 } 757 #else 758 if (getKl) 759 { 760 assert(KEleft->size(0) == nwalk && KEleft->size(1) == local_nCV); 761 assert(KEleft->stride(0) == KEleft->size(1)); 762 Klptr = to_address(KEleft->origin()); 763 } 764 else 765 #endif 766 { 767 Klptr = to_address(SM_TMats.origin()) + cnt; 768 cnt += nwalk * local_nCV; 769 } 770 if (comm->root()) 771 std::fill_n(Krptr, Knr * Knc, SPComplexType(0.0)); 772 if (comm->root()) 773 std::fill_n(Klptr, Knr * Knc, SPComplexType(0.0)); 774 } 775 else if (getKr or getKl) 776 { 777 APP_ABORT(" Error: Kr and/or Kl can only be calculated with addEJ=true.\n"); 778 } 779 SpMatrix_ref Kl(Klptr, {long(Knr), long(Knc)}); 780 SpMatrix_ref Kr(Krptr, {long(Knr), long(Knc)}); 781 782 for (int n = 0; n < nwalk; n++) 783 std::fill_n(E[n].origin(), 3, ComplexType(0.)); 784 785 assert(Gc.num_elements() == nwalk * (nocca_tot + noccb_tot) * nmo_tot); 786 boost::multi::array_cref<ComplexType, 3> G3Da(to_address(Gc.origin()), {nocca_tot, nmo_tot, nwalk}); 787 boost::multi::array_cref<ComplexType, 3> G3Db(to_address(Gc.origin()) + G3Da.num_elements() * (nspin - 1), 788 {noccb_tot, nmo_tot, nwalk}); 789 790 Sp4Tensor_ref GKK(to_address(SM_TMats.origin()) + cnt, {nspin, nkpts, nkpts, nwalk * nmo_max * nocca_max}); 791 cnt += GKK.num_elements(); 792 GKaKjw_to_GKKwaj(nd, Gc, GKK, nocca_tot, noccb_tot, nmo_tot, nmo_max * nocca_max); 793 comm->barrier(); 794 795 // one-body contribution 796 // haj[ndet*nkpts][nocc*nmo] 797 // not parallelized for now, since it would require customization of Wfn 798 if (addH1) 799 { 800 // must use Gc since GKK is is SP 801 int na = 0, nk = 0, nb = 0; 802 for (int n = 0; n < nwalk; n++) 803 E[n][0] = E0; 804 for (int K = 0; K < nkpts; ++K) 805 { 806 #ifdef MIXED_PRECISION 807 boost::multi::array_ref<ComplexType, 2> haj_K(to_address(haj[nd * nkpts + K].origin()), 808 {nelpk[nd][K], nopk[K]}); 809 for (int a = 0; a < nelpk[nd][K]; ++a) 810 ma::product(ComplexType(1.), ma::T(G3Da[na + a].sliced(nk, nk + nopk[K])), haj_K[a], ComplexType(1.), 811 E(E.extension(0), 0)); 812 na += nelpk[nd][K]; 813 if (walker_type == COLLINEAR) 814 { 815 boost::multi::array_ref<ComplexType, 2> haj_Kb(haj_K.origin() + haj_K.num_elements(), 816 {nelpk[nd][nkpts + K], nopk[K]}); 817 for (int b = 0; b < nelpk[nd][nkpts + K]; ++b) 818 ma::product(ComplexType(1.), ma::T(G3Db[nb + b].sliced(nk, nk + nopk[K])), haj_Kb[b], ComplexType(1.), 819 E(E.extension(0), 0)); 820 nb += nelpk[nd][nkpts + K]; 821 } 822 nk += nopk[K]; 823 #else 824 nk = nopk[K]; 825 { 826 na = nelpk[nd][K]; 827 CVector_ref haj_K(to_address(haj[nd * nkpts + K].origin()), {na * nk}); 828 SpMatrix_ref Gaj(to_address(GKK[0][K][K].origin()), {nwalk, na * nk}); 829 ma::product(ComplexType(1.), Gaj, haj_K, ComplexType(1.), E(E.extension(0), 0)); 830 } 831 if (walker_type == COLLINEAR) 832 { 833 na = nelpk[nd][nkpts + K]; 834 CVector_ref haj_K(to_address(haj[nd * nkpts + K].origin()) + nelpk[nd][K] * nk, {na * nk}); 835 SpMatrix_ref Gaj(to_address(GKK[1][K][K].origin()), {nwalk, na * nk}); 836 ma::product(ComplexType(1.), Gaj, haj_K, ComplexType(1.), E(E.extension(0), 0)); 837 } 838 #endif 839 } 840 } 841 842 // move calculation of H1 here 843 // NOTE: For CLOSED/NONCOLLINEAR, can do all walkers simultaneously to improve perf. of GEMM 844 // Not sure how to do it for COLLINEAR. 845 if (addEXX) 846 { 847 if (Qwn.size(0) != nwalk || Qwn.size(1) != nsampleQ) 848 Qwn.reextent({nwalk, nsampleQ}); 849 comm->barrier(); 850 if (comm->root()) 851 { 852 for (int n = 0; n < nwalk; ++n) 853 for (int nQ = 0; nQ < nsampleQ; ++nQ) 854 { 855 Qwn[n][nQ] = distribution(generator); 856 /* 857 RealType drand = distribution(generator); 858 RealType s(0.0); 859 bool found=false; 860 for(int Q=0; Q<nkpts; Q++) { 861 s += gQ[Q]; 862 if( drand < s ) { 863 Qwn[n][nQ] = Q; 864 found=true; 865 break; 866 } 867 } 868 if(not found) 869 APP_ABORT(" Error: sampleQ Qwn. \n"); 870 */ 871 } 872 } 873 comm->barrier(); 874 875 size_t local_memory_needs = 2 * nocca_max * nocca_max * nchol_max; 876 if (TMats.num_elements() < local_memory_needs) 877 TMats.reextent({local_memory_needs, 1}); 878 size_t local_cnt = 0; 879 SPRealType scl = (walker_type == CLOSED ? 2.0 : 1.0); 880 size_t nqk = 1; 881 for (int n = 0; n < nwalk; ++n) 882 { 883 for (int nQ = 0; nQ < nsampleQ; ++nQ) 884 { 885 int Q = Qwn[n][nQ]; 886 for (int Ka = 0; Ka < nkpts; ++Ka) 887 { 888 for (int Kb = 0; Kb < nkpts; ++Kb) 889 { 890 if ((nqk++) % comm->size() == comm->rank()) 891 { 892 int nchol = ncholpQ[Q]; 893 int Qm = kminus[Q]; 894 int Kl = QKToK2[Qm][Kb]; 895 int Kk = QKToK2[Q][Ka]; 896 int nl = nopk[Kl]; 897 int nb = nelpk[nd][Kb]; 898 int na = nelpk[nd][Ka]; 899 int nk = nopk[Kk]; 900 901 SpMatrix_ref Gal(GKK[0][Ka][Kl].origin() + n * na * nl, {na, nl}); 902 SpMatrix_ref Gbk(GKK[0][Kb][Kk].origin() + n * nb * nk, {nb, nk}); 903 SpMatrix_ref Lank(to_address(LQKank[nd * nspin * nkpts + Q][Ka].origin()), {na * nchol, nk}); 904 auto bnl_ptr(to_address(LQKank[nd * nspin * nkpts + Qm][Kb].origin())); 905 if (Q == Qm) 906 bnl_ptr = to_address(LQKbnl[nd * nspin * number_of_symmetric_Q + Qmap[Q] - 1][Kb].origin()); 907 SpMatrix_ref Lbnl(bnl_ptr, {nb * nchol, nl}); 908 909 SpMatrix_ref Tban(TMats.origin() + local_cnt, {nb, na * nchol}); 910 Sp3Tensor_ref T3Dban(TMats.origin() + local_cnt, {nb, na, nchol}); 911 SpMatrix_ref Tabn(Tban.origin() + Tban.num_elements(), {na, nb * nchol}); 912 Sp3Tensor_ref T3Dabn(Tban.origin() + Tban.num_elements(), {na, nb, nchol}); 913 914 if (na > 0 && nb > 0) 915 ma::product(Gal, ma::T(Lbnl), Tabn); 916 if (na > 0 && nb > 0) 917 ma::product(Gbk, ma::T(Lank), Tban); 918 919 SPComplexType E_(0.0); 920 for (int a = 0; a < na; ++a) 921 for (int b = 0; b < nb; ++b) 922 E_ += ma::dot(T3Dabn[a][b], T3Dban[b][a]); 923 E[n][1] -= 0.5 * static_cast<ComplexType>(scl * E_) / gQ[Q] / double(nsampleQ); 924 925 } // if 926 927 if (walker_type == COLLINEAR) 928 { 929 if ((nqk++) % comm->size() == comm->rank()) 930 { 931 int nchol = ncholpQ[Q]; 932 int Qm = kminus[Q]; 933 int Kl = QKToK2[Qm][Kb]; 934 int Kk = QKToK2[Q][Ka]; 935 int nl = nopk[Kl]; 936 int nb = nelpk[nd][nkpts + Kb]; 937 int na = nelpk[nd][nkpts + Ka]; 938 int nk = nopk[Kk]; 939 940 SpMatrix_ref Gal(GKK[1][Ka][Kl].origin() + n * na * nl, {na, nl}); 941 SpMatrix_ref Gbk(GKK[1][Kb][Kk].origin() + n * nb * nk, {nb, nk}); 942 SpMatrix_ref Lank(to_address(LQKank[(nd * nspin + 1) * nkpts + Q][Ka].origin()), {na * nchol, nk}); 943 auto bnl_ptr(to_address(LQKank[(nd * nspin + 1) * nkpts + Qm][Kb].origin())); 944 if (Q == Qm) 945 bnl_ptr = to_address(LQKbnl[(nd * nspin + 1) * number_of_symmetric_Q + Qmap[Q] - 1][Kb].origin()); 946 SpMatrix_ref Lbnl(bnl_ptr, {nb * nchol, nl}); 947 948 SpMatrix_ref Tban(TMats.origin() + local_cnt, {nb, na * nchol}); 949 Sp3Tensor_ref T3Dban(TMats.origin() + local_cnt, {nb, na, nchol}); 950 SpMatrix_ref Tabn(Tban.origin() + Tban.num_elements(), {na, nb * nchol}); 951 Sp3Tensor_ref T3Dabn(Tban.origin() + Tban.num_elements(), {na, nb, nchol}); 952 953 if (na > 0 && nb > 0) 954 ma::product(Gal, ma::T(Lbnl), Tabn); 955 if (na > 0 && nb > 0) 956 ma::product(Gbk, ma::T(Lank), Tban); 957 958 SPComplexType E_(0.0); 959 for (int a = 0; a < na; ++a) 960 for (int b = 0; b < nb; ++b) 961 E_ += ma::dot(T3Dabn[a][b], T3Dban[b][a]); 962 E[n][1] -= 0.5 * static_cast<ComplexType>(scl * E_) / gQ[Q] / double(nsampleQ); 963 964 } // if 965 } // COLLINEAR 966 } // Kb 967 } // Ka 968 } // nQ 969 } // n 970 } 971 972 if (addEJ) 973 { 974 size_t local_memory_needs = 2 * nchol_max * nwalk; 975 if (TMats.num_elements() < local_memory_needs) 976 TMats.reextent({local_memory_needs, 1}); 977 cnt = 0; 978 SpMatrix_ref Kr_local(TMats.origin(), {nwalk, nchol_max}); 979 cnt += Kr_local.num_elements(); 980 SpMatrix_ref Kl_local(TMats.origin() + cnt, {nwalk, nchol_max}); 981 cnt += Kl_local.num_elements(); 982 std::fill_n(Kr_local.origin(), Kr_local.num_elements(), SPComplexType(0.0)); 983 std::fill_n(Kl_local.origin(), Kl_local.num_elements(), SPComplexType(0.0)); 984 size_t nqk = 1; 985 for (int Q = 0; Q < nkpts; ++Q) 986 { 987 bool haveKE = false; 988 for (int Ka = 0; Ka < nkpts; ++Ka) 989 { 990 if ((nqk++) % comm->size() == comm->rank()) 991 { 992 haveKE = true; 993 int nchol = ncholpQ[Q]; 994 int Qm = kminus[Q]; 995 int Kl = QKToK2[Qm][Ka]; 996 int Kk = QKToK2[Q][Ka]; 997 int nl = nopk[Kl]; 998 int na = nelpk[nd][Ka]; 999 int nk = nopk[Kk]; 1000 1001 Sp3Tensor_ref Gwal(GKK[0][Ka][Kl].origin(), {nwalk, na, nl}); 1002 Sp3Tensor_ref Gwbk(GKK[0][Ka][Kk].origin(), {nwalk, na, nk}); 1003 Sp3Tensor_ref Lank(to_address(LQKank[nd * nspin * nkpts + Q][Ka].origin()), {na, nchol, nk}); 1004 SPComplexType* bnl_ptr(to_address(LQKank[nd * nspin * nkpts + Qm][Ka].origin())); 1005 if (Q == Qm) 1006 bnl_ptr = to_address(LQKbnl[nd * nspin * number_of_symmetric_Q + Qmap[Q] - 1][Ka].origin()); 1007 Sp3Tensor_ref Lbnl(bnl_ptr, {na, nchol, nl}); 1008 1009 // Twan = sum_l G[w][a][l] L[a][n][l] 1010 for (int n = 0; n < nwalk; ++n) 1011 for (int a = 0; a < na; ++a) 1012 ma::product(SPComplexType(1.0), Lbnl[a], Gwal[n][a], SPComplexType(1.0), Kl_local[n]); 1013 for (int n = 0; n < nwalk; ++n) 1014 for (int a = 0; a < na; ++a) 1015 ma::product(SPComplexType(1.0), Lank[a], Gwbk[n][a], SPComplexType(1.0), Kr_local[n]); 1016 } // if 1017 1018 if (walker_type == COLLINEAR) 1019 { 1020 if ((nqk++) % comm->size() == comm->rank()) 1021 { 1022 haveKE = true; 1023 int nchol = ncholpQ[Q]; 1024 int Qm = kminus[Q]; 1025 int Kl = QKToK2[Qm][Ka]; 1026 int Kk = QKToK2[Q][Ka]; 1027 int nl = nopk[Kl]; 1028 int na = nelpk[nd][nkpts + Ka]; 1029 int nk = nopk[Kk]; 1030 1031 Sp3Tensor_ref Gwal(GKK[1][Ka][Kl].origin(), {nwalk, na, nl}); 1032 Sp3Tensor_ref Gwbk(GKK[1][Ka][Kk].origin(), {nwalk, na, nk}); 1033 Sp3Tensor_ref Lank(to_address(LQKank[(nd * nspin + 1) * nkpts + Q][Ka].origin()), {na, nchol, nk}); 1034 auto bnl_ptr(to_address(LQKank[(nd * nspin + 1) * nkpts + Qm][Ka].origin())); 1035 if (Q == Qm) 1036 bnl_ptr = to_address(LQKbnl[(nd * nspin + 1) * number_of_symmetric_Q + Qmap[Q] - 1][Ka].origin()); 1037 Sp3Tensor_ref Lbnl(bnl_ptr, {na, nchol, nl}); 1038 1039 // Twan = sum_l G[w][a][l] L[a][n][l] 1040 for (int n = 0; n < nwalk; ++n) 1041 for (int a = 0; a < na; ++a) 1042 ma::product(SPComplexType(1.0), Lbnl[a], Gwal[n][a], SPComplexType(1.0), Kl_local[n]); 1043 for (int n = 0; n < nwalk; ++n) 1044 for (int a = 0; a < na; ++a) 1045 ma::product(SPComplexType(1.0), Lank[a], Gwbk[n][a], SPComplexType(1.0), Kr_local[n]); 1046 1047 } // if 1048 } // COLLINEAR 1049 } // Ka 1050 if (haveKE) 1051 { 1052 std::lock_guard<shared_mutex> guard(*mutex[Q]); 1053 int nc0 = std::accumulate(ncholpQ.begin(), ncholpQ.begin() + Q, 0); 1054 for (int n = 0; n < nwalk; n++) 1055 { 1056 ma::axpy(SPComplexType(1.0), Kr_local[n].sliced(0, ncholpQ[Q]), Kr[n].sliced(nc0, nc0 + ncholpQ[Q])); 1057 ma::axpy(SPComplexType(1.0), Kl_local[n].sliced(0, ncholpQ[Q]), Kl[n].sliced(nc0, nc0 + ncholpQ[Q])); 1058 } 1059 } // to release the lock 1060 if (haveKE) 1061 { 1062 std::fill_n(Kr_local.origin(), Kr_local.num_elements(), SPComplexType(0.0)); 1063 std::fill_n(Kl_local.origin(), Kl_local.num_elements(), SPComplexType(0.0)); 1064 } 1065 } // Q 1066 comm->barrier(); 1067 nqk = 0; 1068 SPRealType scl = (walker_type == CLOSED ? 2.0 : 1.0); 1069 for (int n = 0; n < nwalk; ++n) 1070 { 1071 for (int Q = 0; Q < nkpts; ++Q) 1072 { // momentum conservation index 1073 if ((nqk++) % comm->size() == comm->rank()) 1074 { 1075 int nc0 = std::accumulate(ncholpQ.begin(), ncholpQ.begin() + Q, 0); 1076 E[n][2] += 0.5 * 1077 static_cast<ComplexType>(scl * scl * 1078 ma::dot(Kl[n]({nc0, nc0 + ncholpQ[Q]}), Kr[n]({nc0, nc0 + ncholpQ[Q]}))); 1079 } 1080 } 1081 } 1082 #if defined(MIXED_PRECISION) 1083 if (getKl) 1084 { 1085 size_t i0, iN; 1086 std::tie(i0, iN) = 1087 FairDivideBoundary(size_t(comm->rank()), size_t(KEleft->num_elements()), size_t(comm->size())); 1088 copy_n_cast(Klptr + i0, iN - i0, to_address(KEleft->origin()) + i0); 1089 } 1090 if (getKr) 1091 { 1092 size_t i0, iN; 1093 std::tie(i0, iN) = 1094 FairDivideBoundary(size_t(comm->rank()), size_t(KEright->num_elements()), size_t(comm->size())); 1095 copy_n_cast(Krptr + i0, iN - i0, to_address(KEright->origin()) + i0); 1096 } 1097 comm->barrier(); 1098 #endif 1099 } 1100 } 1101 1102 template<class... Args> 1103 void fast_energy(Args&&... args) 1104 { 1105 APP_ABORT(" Error: fast_energy not implemented in KP3IndexFactorization. \n"); 1106 } 1107 1108 template<class MatA, 1109 class MatB, 1110 typename = typename std::enable_if_t<(std::decay<MatA>::type::dimensionality == 1)>, 1111 typename = typename std::enable_if_t<(std::decay<MatB>::type::dimensionality == 1)>, 1112 typename = void> 1113 void vHS(MatA& X, MatB&& v, double a = 1., double c = 0.) 1114 { 1115 using BType = typename std::decay<MatB>::type::element; 1116 using AType = typename std::decay<MatA>::type::element; 1117 boost::multi::array_ref<BType, 2> v_(to_address(v.origin()), {1, v.size(0)}); 1118 boost::multi::array_ref<AType, 2> X_(to_address(X.origin()), {X.size(0), 1}); 1119 return vHS(X_, v_, a, c); 1120 } 1121 1122 template<class MatA, 1123 class MatB, 1124 typename = typename std::enable_if_t<(std::decay<MatA>::type::dimensionality == 2)>, 1125 typename = typename std::enable_if_t<(std::decay<MatB>::type::dimensionality == 2)>> 1126 void vHS(MatA& Xw, MatB&& v, double a = 1., double c = 0.) 1127 { 1128 int nkpts = nopk.size(); 1129 int nwalk = Xw.size(1); 1130 assert(v.size(0) == nwalk); 1131 int nspin = (walker_type == COLLINEAR ? 2 : 1); 1132 int nmo_tot = std::accumulate(nopk.begin(), nopk.end(), 0); 1133 int nmo_max = *std::max_element(nopk.begin(), nopk.end()); 1134 int nchol_max = *std::max_element(ncholpQ.begin(), ncholpQ.end()); 1135 assert(Xw.num_elements() == nwalk * 2 * local_nCV); 1136 assert(v.num_elements() == nwalk * nmo_tot * nmo_tot); 1137 SPComplexType one(1.0, 0.0); 1138 SPComplexType im(0.0, 1.0); 1139 SPComplexType halfa(0.5 * a, 0.0); 1140 size_t local_memory_needs = nmo_max * nmo_max * nwalk; 1141 if (TMats.num_elements() < local_memory_needs) 1142 TMats.reextent({local_memory_needs, 1}); 1143 1144 using vType = typename std::decay<MatB>::type::element; 1145 boost::multi::array_ref<vType, 3> v3D(to_address(v.origin()), {nwalk, nmo_tot, nmo_tot}); 1146 1147 sp_pointer Xptr(nullptr); 1148 // I WANT C++17!!!!!! 1149 using XType = typename std::decay<MatA>::type::element; 1150 if (std::is_same<SPComplexType, XType>::value) 1151 { 1152 Xptr = reinterpret_cast<sp_pointer>(to_address(Xw.origin())); 1153 } 1154 else 1155 { 1156 size_t mem_needs = Xw.num_elements(); 1157 set_shm_buffer(mem_needs); 1158 Xptr = to_address(SM_TMats.origin()); 1159 { 1160 size_t i0, iN; 1161 std::tie(i0, iN) = FairDivideBoundary(size_t(comm->rank()), size_t(Xw.num_elements()), size_t(comm->size())); 1162 copy_n_cast(to_address(Xw.origin()) + i0, iN - i0, Xptr + i0); 1163 } 1164 comm->barrier(); 1165 } 1166 SpMatrix_ref X(Xptr, Xw.extensions()); 1167 1168 // "rotate" X 1169 // XIJ = 0.5*a*(Xn+ -i*Xn-), XJI = 0.5*a*(Xn+ +i*Xn-) 1170 for (int Q = 0; Q < nkpts; ++Q) 1171 { 1172 if (Qmap[Q] < 0) 1173 continue; 1174 int nq = Q2vbias[Q]; 1175 int nc0, ncN; 1176 std::tie(nc0, ncN) = FairDivideBoundary(comm->rank(), ncholpQ[Q], comm->size()); 1177 auto Xnp = to_address(X[nq + nc0].origin()); 1178 auto Xnm = to_address(X[nq + ncholpQ[Q] + nc0].origin()); 1179 for (int n = nc0; n < ncN; ++n) 1180 { 1181 for (int nw = 0; nw < nwalk; ++nw, ++Xnp, ++Xnm) 1182 { 1183 SPComplexType Xnp_ = halfa * ((*Xnp) - im * (*Xnm)); 1184 *Xnm = halfa * ((*Xnp) + im * (*Xnm)); 1185 *Xnp = Xnp_; 1186 } 1187 } 1188 } 1189 comm->barrier(); 1190 // then combine Q/(-Q) pieces if Q != -Q 1191 // X(Q)np = (X(Q)np + X(-Q)nm) 1192 for (int Q = 0; Q < nkpts; ++Q) 1193 { 1194 if (Qmap[Q] == 0) 1195 { 1196 int Qm = kminus[Q]; 1197 int nq = Q2vbias[Q]; 1198 int nqm0 = Q2vbias[Qm]; 1199 int nc0, ncN; 1200 std::tie(nc0, ncN) = FairDivideBoundary(comm->rank(), ncholpQ[Q], comm->size()); 1201 auto Xnp = to_address(X[nq + nc0].origin()); 1202 auto Xnm = to_address(X[nqm0 + ncholpQ[Q] + nc0].origin()); 1203 for (int n = nc0; n < ncN; ++n) 1204 for (int nw = 0; nw < nwalk; ++nw, ++Xnp, ++Xnm) 1205 *Xnp = ((*Xnp) + (*Xnm)); 1206 } 1207 } 1208 // scale v by 'c': assuming contiguous data 1209 { 1210 size_t i0, iN; 1211 std::tie(i0, iN) = FairDivideBoundary(size_t(comm->rank()), size_t(v.num_elements()), size_t(comm->size())); 1212 auto v_ = to_address(v.origin()) + i0; 1213 for (size_t i = i0; i < iN; ++i, ++v_) 1214 *v_ *= c; 1215 } 1216 comm->barrier(); 1217 1218 using ma::H; 1219 using ma::T; 1220 size_t nqk = 0; 1221 for (int Q = 0; Q < nkpts; ++Q) 1222 { // momentum conservation index 1223 if (Qmap[Q] < 0) 1224 continue; 1225 int nc0 = Q2vbias[Q]; 1226 for (int K = 0; K < nkpts; ++K) 1227 { // K is the index of the kpoint pair of (i,k) 1228 if ((nqk++) % comm->size() == comm->rank()) 1229 { 1230 int nchol = ncholpQ[Q]; 1231 int QK = QKToK2[Q][K]; 1232 int ni = nopk[K]; 1233 int ni0 = std::accumulate(nopk.begin(), nopk.begin() + K, 0); 1234 int nk = nopk[QK]; 1235 int nk0 = std::accumulate(nopk.begin(), nopk.begin() + QK, 0); 1236 1237 1238 // v[nw][i(in K)][k(in Q(K))] += sum_n LQK[i][k][n] X[Q][n+][nw] 1239 if (Q <= kminus[Q]) 1240 { 1241 SpMatrix_ref vik(TMats.origin(), {nwalk, ni * nk}); 1242 Sp3Tensor_ref vik3D(TMats.origin(), {nwalk, ni, nk}); 1243 SpMatrix_ref Likn(to_address(LQKikn[Q][K].origin()), {ni * nk, nchol}); 1244 ma::product(T(X.sliced(nc0, nc0 + nchol)), T(Likn), vik); 1245 for (int nw = 0; nw < nwalk; nw++) 1246 for (int i = 0; i < ni; i++) 1247 ma::axpy(one, vik3D[nw][i], v3D[nw][ni0 + i].sliced(nk0, nk0 + nk)); 1248 } 1249 else 1250 { // use L(-Q)(Kk)* 1251 SpMatrix_ref vki(TMats.origin(), {nwalk, nk * ni}); 1252 Sp3Tensor_ref vki3D(TMats.origin(), {nwalk, nk, ni}); 1253 SpMatrix_ref Likn(to_address(LQKikn[kminus[Q]][QK].origin()), {nk * ni, nchol}); 1254 ma::product(T(X.sliced(nc0, nc0 + nchol)), H(Likn), vki); 1255 for (int nw = 0; nw < nwalk; nw++) 1256 { 1257 const auto&& vki_n(vki3D[nw]); 1258 for (int i = 0; i < ni; i++) 1259 { 1260 auto v3D_ni(to_address(v3D[nw][ni0 + i].origin()) + nk0); 1261 for (int k = 0; k < nk; k++, ++v3D_ni) 1262 *v3D_ni += static_cast<vType>(vki_n[k][i]); 1263 } 1264 } 1265 } 1266 } 1267 } 1268 } 1269 comm->barrier(); 1270 // adding second half when Q==(-Q) 1271 nqk = 0; 1272 for (int Q = 0; Q < nkpts; ++Q) 1273 { 1274 if (Qmap[Q] > 0) 1275 { // rho(Q)^+ term 1276 int nc0 = Q2vbias[Q]; 1277 for (int K = 0; K < nkpts; ++K) 1278 { // K is the index of the kpoint pair of (i,k) 1279 if ((nqk++) % comm->size() == comm->rank()) 1280 { 1281 int nchol = ncholpQ[Q]; 1282 int ni = nopk[K]; 1283 int ni0 = std::accumulate(nopk.begin(), nopk.begin() + K, 0); 1284 int nk = nopk[QKToK2[Q][K]]; 1285 int nk0 = std::accumulate(nopk.begin(), nopk.begin() + QKToK2[Q][K], 0); 1286 SpMatrix_ref Likn(to_address(LQKikn[Q][K].origin()), {ni * nk, nchol}); 1287 SpMatrix_ref vik(TMats.origin(), {nwalk, ni * nk}); 1288 Sp3Tensor_ref vik3D(TMats.origin(), {nwalk, ni, nk}); 1289 // v[nw][k(in Q(K))][i(in K)] += sum_n conj(LQK[i][k][n]) X[Q][n-][nw] 1290 ma::product(T(X.sliced(nc0 + nchol, nc0 + 2 * nchol)), H(Likn), vik); 1291 for (int nw = 0; nw < nwalk; nw++) 1292 { 1293 const auto&& vik3D_n = vik3D[nw]; 1294 for (int k = 0; k < nk; k++) 1295 { 1296 auto v3D_nk = to_address(v3D[nw][nk0 + k].origin()) + ni0; 1297 for (int i = 0; i < ni; i++, ++v3D_nk) 1298 *v3D_nk += static_cast<vType>(vik3D_n[i][k]); 1299 } 1300 } 1301 } 1302 } 1303 } 1304 } 1305 // do I need to "rotate" back, can be done if necessary 1306 comm->barrier(); 1307 } 1308 1309 template<class MatA, 1310 class MatB, 1311 typename = typename std::enable_if_t<(std::decay<MatA>::type::dimensionality == 1)>, 1312 typename = typename std::enable_if_t<(std::decay<MatB>::type::dimensionality == 1)>, 1313 typename = void> 1314 void vbias(const MatA& G, MatB&& v, double a = 1., double c = 0., int k = 0) 1315 { 1316 using BType = typename std::decay<MatB>::type::element; 1317 using AType = typename std::decay<MatA>::type::element; 1318 boost::multi::array_ref<BType, 2> v_(to_address(v.origin()), {v.size(0), 1}); 1319 boost::multi::array_cref<AType, 2> G_(to_address(G.origin()), {G.size(0), 1}); 1320 return vbias(G_, v_, a, c, k); 1321 } 1322 1323 template<class MatA, 1324 class MatB, 1325 typename = typename std::enable_if_t<(std::decay<MatA>::type::dimensionality == 2)>, 1326 typename = typename std::enable_if_t<(std::decay<MatB>::type::dimensionality == 2)>> 1327 void vbias(const MatA& Gw, MatB&& v, double a = 1., double c = 0., int nd = 0) 1328 { 1329 using std::copy_n; 1330 using GType = typename std::decay_t<typename MatA::element>; 1331 using vType = typename std::decay<MatB>::type::element; 1332 int nkpts = nopk.size(); 1333 assert(nd >= 0 && nd < nelpk.size()); 1334 int nwalk = Gw.size(1); 1335 assert(v.size(0) == 2 * local_nCV); 1336 assert(v.size(1) == nwalk); 1337 int nspin = (walker_type == COLLINEAR ? 2 : 1); 1338 int npol = (walker_type == NONCOLLINEAR ? 2 : 1); 1339 int nmo_tot = std::accumulate(nopk.begin(), nopk.end(), 0); 1340 int nmo_max = *std::max_element(nopk.begin(), nopk.end()); 1341 int nocca_tot = std::accumulate(nelpk[nd].begin(), nelpk[nd].begin() + nkpts, 0); 1342 int nocca_max = *std::max_element(nelpk[nd].begin(), nelpk[nd].begin() + nkpts); 1343 int noccb_max = nocca_max; 1344 int nchol_max = *std::max_element(ncholpQ.begin(), ncholpQ.end()); 1345 int noccb_tot = 0; 1346 if (walker_type == COLLINEAR) 1347 { 1348 noccb_tot = std::accumulate(nelpk[nd].begin() + nkpts, nelpk[nd].begin() + 2 * nkpts, 0); 1349 noccb_max = *std::max_element(nelpk[nd].begin() + nkpts, nelpk[nd].begin() + 2 * nkpts); 1350 } 1351 RealType scl = (walker_type == CLOSED ? 2.0 : 1.0); 1352 SPComplexType one(1.0, 0.0); 1353 SPComplexType halfa(0.5 * a * scl, 0.0); 1354 SPComplexType minusimhalfa(0.0, -0.5 * a * scl); 1355 SPComplexType imhalfa(0.0, 0.5 * a * scl); 1356 size_t local_memory_needs = 2 * nchol_max * nwalk; 1357 if (walker_type == NONCOLLINEAR) 1358 local_memory_needs += nmo_max * npol * nwalk; // for transposed G 1359 if (TMats.num_elements() < local_memory_needs) 1360 TMats.reextent({local_memory_needs, 1}); 1361 SpMatrix_ref vlocal(TMats.origin(), {2 * nchol_max, nwalk}); 1362 std::fill_n(vlocal.origin(), vlocal.num_elements(), SPComplexType(0.0)); 1363 1364 assert(Gw.num_elements() == nwalk * (nocca_tot + noccb_tot) * npol * nmo_tot); 1365 const_sp_pointer Gptr(nullptr); 1366 // I WANT C++17!!!!!! 1367 if (std::is_same<SPComplexType, GType>::value) 1368 { 1369 Gptr = reinterpret_cast<const_sp_pointer>(to_address(Gw.origin())); 1370 } 1371 else 1372 { 1373 size_t mem_needs = Gw.num_elements(); 1374 set_shm_buffer(mem_needs); 1375 { 1376 size_t i0, iN; 1377 std::tie(i0, iN) = FairDivideBoundary(size_t(comm->rank()), size_t(Gw.num_elements()), size_t(comm->size())); 1378 copy_n_cast(to_address(Gw.origin()) + i0, iN - i0, to_address(SM_TMats.origin()) + i0); 1379 } 1380 Gptr = to_address(SM_TMats.origin()); 1381 comm->barrier(); 1382 } 1383 1384 boost::multi::array_cref<SPComplexType, 4> G3Da(Gptr, {nocca_tot, npol, nmo_tot, nwalk}); 1385 boost::multi::array_cref<SPComplexType, 3> G3Db(Gptr + G3Da.num_elements() * (nspin - 1), 1386 {noccb_tot, nmo_tot, nwalk}); 1387 1388 { 1389 size_t i0, iN; 1390 std::tie(i0, iN) = FairDivideBoundary(size_t(comm->rank()), size_t(v.size(0)), size_t(comm->size())); 1391 for (size_t i = i0; i < iN; ++i) 1392 ma::scal(c, v[i]); 1393 } 1394 comm->barrier(); 1395 1396 size_t nqk = 0; 1397 for (int K = 0; K < nkpts; ++K) 1398 { // K is the index of the kpoint pair of (a,k) 1399 for (int Q = 0; Q < nkpts; ++Q) 1400 { // momentum conservation index 1401 if (Qmap[Q] < 0) 1402 continue; 1403 bool haveV = false; 1404 if ((nqk++) % comm->size() == comm->rank()) 1405 { 1406 haveV = true; 1407 int nchol = ncholpQ[Q]; 1408 int na = nelpk[nd][K]; 1409 int na0 = std::accumulate(nelpk[nd].begin(), nelpk[nd].begin() + K, 0); 1410 int nk = nopk[QKToK2[Q][K]]; 1411 int nk0 = std::accumulate(nopk.begin(), nopk.begin() + QKToK2[Q][K], 0); 1412 auto&& v1 = vlocal({0, nchol}, {0, nwalk}); 1413 1414 if (walker_type == NONCOLLINEAR) 1415 { 1416 Sp3Tensor_ref Lank(to_address(LQKank[nd * nspin * nkpts + Q][K].origin()), {na, nchol, npol * nk}); 1417 // v1[Q][n][nw] += sum_K sum_a_p_k LQK[a][n][p][k] G[a][p][k][nw] 1418 for (int a = 0; a < na; ++a) 1419 { 1420 SpMatrix_ref Ga(to_address(vlocal.origin()) + vlocal.num_elements(), {npol * nk, nwalk}); 1421 SpMatrix_ref Ga_(Ga.origin(), {npol, nk * nwalk}); 1422 for (int p = 0; p < npol; p++) 1423 copy_n(to_address(G3Da[na0 + a][p][nk0].origin()), nk * nwalk, Ga_[p].origin()); 1424 ma::product(one, Lank[a], Ga, one, v1); 1425 } 1426 } 1427 else 1428 { 1429 Sp3Tensor_ref Lank(to_address(LQKank[nd * nspin * nkpts + Q][K].origin()), {na, nchol, nk}); 1430 1431 // v1[Q][n][nw] += sum_K sum_a_k LQK[a][n][k] G[a][k][nw] 1432 for (int a = 0; a < na; ++a) 1433 ma::product(one, Lank[a], G3Da[na0 + a][0]({nk0, nk0 + nk}, {0, nwalk}), one, v1); 1434 } 1435 } 1436 if (walker_type == COLLINEAR) 1437 { 1438 if ((nqk++) % comm->size() == comm->rank()) 1439 { 1440 haveV = true; 1441 int nchol = ncholpQ[Q]; 1442 int na = nelpk[nd][nkpts + K]; 1443 int na0 = std::accumulate(nelpk[nd].begin() + nkpts, nelpk[nd].begin() + nkpts + K, 0); 1444 int nk = nopk[QKToK2[Q][K]]; 1445 int nk0 = std::accumulate(nopk.begin(), nopk.begin() + QKToK2[Q][K], 0); 1446 auto&& v1 = vlocal({0, nchol}, {0, nwalk}); 1447 1448 Sp3Tensor_ref Lank(to_address(LQKank[(nd * nspin + 1) * nkpts + Q][K].origin()), {na, nchol, nk}); 1449 1450 // v1[Q][n][nw] += sum_K sum_a_k LQK[a][n][k] G[a][k][nw] 1451 for (int a = 0; a < na; ++a) 1452 ma::product(one, Lank[a], G3Db[na0 + a]({nk0, nk0 + nk}, {0, nwalk}), one, v1); 1453 } 1454 } 1455 if (haveV) 1456 { 1457 { 1458 std::lock_guard<shared_mutex> guard(*mutex[Q]); 1459 int nc0 = Q2vbias[Q]; 1460 // v+ = 0.5*a*(v1+v2) 1461 ma::axpy(halfa, vlocal.sliced(0, ncholpQ[Q]), v.sliced(nc0, nc0 + ncholpQ[Q])); 1462 // v- = -0.5*a*i*(v1-v2) 1463 ma::axpy(minusimhalfa, vlocal.sliced(0, ncholpQ[Q]), v.sliced(nc0 + ncholpQ[Q], nc0 + 2 * ncholpQ[Q])); 1464 } 1465 int Qm = kminus[Q]; 1466 int nc0 = Q2vbias[Qm]; 1467 if (Qmap[Q] == 0) 1468 { 1469 std::lock_guard<shared_mutex> guard(*mutex[Qm]); 1470 // v+ = 0.5*a*(v1+v2) 1471 ma::axpy(halfa, vlocal.sliced(0, ncholpQ[Qm]), v.sliced(nc0, nc0 + ncholpQ[Qm])); 1472 // v- = -0.5*a*i*(v1-v2) 1473 ma::axpy(imhalfa, vlocal.sliced(0, ncholpQ[Qm]), v.sliced(nc0 + ncholpQ[Qm], nc0 + 2 * ncholpQ[Qm])); 1474 } 1475 } // to release the lock 1476 if (haveV) 1477 std::fill_n(vlocal.origin(), vlocal.num_elements(), SPComplexType(0.0)); 1478 } 1479 } 1480 // add second contribution when Q==(-Q) 1481 for (int Q = 0; Q < nkpts; ++Q) 1482 { 1483 if (Qmap[Q] > 0) 1484 { // rho(Q)^+ term 1485 for (int K = 0; K < nkpts; ++K) 1486 { // K is the index of the kpoint pair of (a,k) 1487 bool haveV = false; 1488 if ((nqk++) % comm->size() == comm->rank()) 1489 { 1490 haveV = true; 1491 int nchol = ncholpQ[Q]; 1492 int na = nelpk[nd][K]; 1493 int na0 = std::accumulate(nelpk[nd].begin(), nelpk[nd].begin() + K, 0); 1494 int nk = nopk[QKToK2[Q][K]]; 1495 int nk0 = std::accumulate(nopk.begin(), nopk.begin() + QKToK2[Q][K], 0); 1496 auto&& v1 = vlocal({0, nchol}, {0, nwalk}); 1497 1498 if (walker_type == NONCOLLINEAR) 1499 { 1500 Sp3Tensor_ref Lbnl(to_address(LQKbnl[nd * nspin * number_of_symmetric_Q + Qmap[Q] - 1][K].origin()), 1501 {na, nchol, npol * nk}); 1502 1503 // v1[Q][n][nw] += sum_K sum_a_s_k LQK[b][n][s][l] G[b][s][l][nw] 1504 for (int a = 0; a < na; ++a) 1505 { 1506 SpMatrix_ref Ga(to_address(vlocal.origin()) + vlocal.num_elements(), {npol * nk, nwalk}); 1507 SpMatrix_ref Ga_(Ga.origin(), {npol, nk * nwalk}); 1508 for (int p = 0; p < npol; p++) 1509 copy_n(to_address(G3Da[na0 + a][p][nk0].origin()), nk * nwalk, Ga_[p].origin()); 1510 ma::product(one, Lbnl[a], Ga, one, v1); 1511 } 1512 } 1513 else 1514 { 1515 Sp3Tensor_ref Lbnl(to_address(LQKbnl[nd * nspin * number_of_symmetric_Q + Qmap[Q] - 1][K].origin()), 1516 {na, nchol, nk}); 1517 1518 // v1[Q][n][nw] += sum_K sum_a_k LQK[b][n][l] G[b][l][nw] 1519 for (int a = 0; a < na; ++a) 1520 ma::product(one, Lbnl[a], G3Da[na0 + a][0]({nk0, nk0 + nk}, {0, nwalk}), one, v1); 1521 } 1522 } 1523 if (walker_type == COLLINEAR) 1524 { 1525 if ((nqk++) % comm->size() == comm->rank()) 1526 { 1527 haveV = true; 1528 int nchol = ncholpQ[Q]; 1529 int na = nelpk[nd][nkpts + K]; 1530 int na0 = std::accumulate(nelpk[nd].begin() + nkpts, nelpk[nd].begin() + nkpts + K, 0); 1531 int nk = nopk[QKToK2[Q][K]]; 1532 int nk0 = std::accumulate(nopk.begin(), nopk.begin() + QKToK2[Q][K], 0); 1533 auto&& v1 = vlocal({0, nchol}, {0, nwalk}); 1534 1535 Sp3Tensor_ref Lbnl(to_address(LQKbnl[(nd * nspin + 1) * number_of_symmetric_Q + Qmap[Q] - 1][K].origin()), 1536 {na, nchol, nk}); 1537 1538 // v1[Q][n][nw] += sum_K sum_a_k LQK[b][n][l] G[b][l][nw] 1539 for (int a = 0; a < na; ++a) 1540 ma::product(one, Lbnl[a], G3Db[na0 + a]({nk0, nk0 + nk}, {0, nwalk}), one, v1); 1541 } 1542 } 1543 if (haveV) 1544 { 1545 { 1546 std::lock_guard<shared_mutex> guard(*mutex[Q]); 1547 int nc0 = Q2vbias[Q]; 1548 // v+ = 0.5*a*(v1+v2) 1549 ma::axpy(halfa, vlocal.sliced(0, ncholpQ[Q]), v.sliced(nc0, nc0 + ncholpQ[Q])); 1550 // v- = -0.5*a*i*(v1-v2) 1551 ma::axpy(imhalfa, vlocal.sliced(0, ncholpQ[Q]), v.sliced(nc0 + ncholpQ[Q], nc0 + 2 * ncholpQ[Q])); 1552 } 1553 } // to release the lock 1554 if (haveV) 1555 std::fill_n(vlocal.origin(), vlocal.num_elements(), SPComplexType(0.0)); 1556 } 1557 } 1558 } 1559 comm->barrier(); 1560 } 1561 1562 template<class Mat, class MatB> 1563 void generalizedFockMatrix(Mat&& G, MatB&& Fp, MatB&& Fm) 1564 { 1565 APP_ABORT(" Error: generalizedFockMatrix not implemented for this hamiltonian.\n"); 1566 } 1567 1568 bool distribution_over_cholesky_vectors() const { return true; } 1569 int number_of_ke_vectors() const { return local_nCV; } 1570 int local_number_of_cholesky_vectors() const { return 2 * local_nCV; } 1571 int global_number_of_cholesky_vectors() const { return global_nCV; } 1572 int global_origin_cholesky_vector() const { return global_origin; } 1573 1574 // transpose=true means G[nwalk][ik], false means G[ik][nwalk] 1575 bool transposed_G_for_vbias() const { return false; } 1576 bool transposed_G_for_E() const { return false; } 1577 // transpose=true means vHS[nwalk][ik], false means vHS[ik][nwalk] 1578 bool transposed_vHS() const { return true; } 1579 1580 bool fast_ph_energy() const { return false; } 1581 1582 boost::multi::array<ComplexType, 2> getHSPotentials() { return boost::multi::array<ComplexType, 2>{}; } 1583 1584 private: 1585 communicator* comm; 1586 1587 WALKER_TYPES walker_type; 1588 1589 int global_origin; 1590 int global_nCV; 1591 int local_nCV; 1592 1593 ValueType E0; 1594 1595 // bare one body hamiltonian 1596 shmC3Tensor H1; 1597 1598 // (potentially half rotated) one body hamiltonian 1599 shmCMatrix haj; 1600 1601 // number of orbitals per k-point 1602 IVector nopk; 1603 1604 // number of cholesky vectors per Q-point 1605 IVector ncholpQ; 1606 1607 // position of (-K) in kp-list for every K 1608 IVector kminus; 1609 1610 // number of electrons per k-point 1611 // nelpk[ndet][nspin*nkpts] 1612 shmIMatrix nelpk; 1613 1614 // maps (Q,K) --> k2 1615 shmIMatrix QKToK2; 1616 1617 //Cholesky Tensor Lik[Q][nk][i][k][n] 1618 std::vector<shmSpMatrix> LQKikn; 1619 1620 // half-tranformed Cholesky tensor 1621 std::vector<shmSpMatrix> LQKank; 1622 1623 // half-tranformed Cholesky tensor 1624 std::vector<shmSpMatrix> LQKbnl; 1625 1626 // Defines behavior over Q vector: 1627 // <0: Ignore (handled by another TG) 1628 // 0: Calculate, without rho^+ contribution 1629 // >0: Calculate, with rho^+ contribution. LQKbln data located at Qmap[Q]-1 1630 IVector Qmap; 1631 IVector Q2vbias; 1632 int number_of_symmetric_Q; 1633 1634 // one-body piece of Hamiltonian factorization 1635 shmC3Tensor vn0; 1636 1637 int nsampleQ; 1638 std::vector<RealType> gQ; 1639 shmIMatrix Qwn; 1640 std::default_random_engine generator; 1641 std::discrete_distribution<int> distribution; 1642 1643 // shared buffer space 1644 // using matrix since there are issues with vectors 1645 shmSpMatrix SM_TMats; 1646 SpMatrix TMats; 1647 1648 std::vector<std::unique_ptr<shared_mutex>> mutex; 1649 1650 // boost::multi::array<ComplexType,3> Qave; 1651 // int cntQave=0; 1652 std::vector<ComplexType> EQ; 1653 // std::default_random_engine generator; 1654 // std::uniform_real_distribution<RealType> distribution(RealType(0.0),Realtype(1.0)); 1655 1656 myTimer Timer; 1657 1658 void set_shm_buffer(size_t N) 1659 { 1660 if (SM_TMats.num_elements() < N) 1661 SM_TMats.reextent({N, 1}); 1662 } 1663 1664 template<class MatA, class MatB> 1665 void GKaKjw_to_GKKwaj(int nd, MatA const& GKaKj, MatB&& GKKaj, int nocca_tot, int noccb_tot, int nmo_tot, int akmax) 1666 { 1667 int nspin = (walker_type == COLLINEAR ? 2 : 1); 1668 int npol = (walker_type == NONCOLLINEAR ? 2 : 1); 1669 int nwalk = GKaKj.size(1); 1670 int nkpts = nopk.size(); 1671 assert(GKaKj.num_elements() == (nocca_tot + noccb_tot) * npol * nmo_tot * nwalk); 1672 assert(GKKaj.num_elements() == nspin * nkpts * nkpts * npol * akmax * nwalk); 1673 boost::multi::array_cref<ComplexType, 4> Gca(to_address(GKaKj.origin()), {nocca_tot, npol, nmo_tot, nwalk}); 1674 boost::multi::array_cref<ComplexType, 3> Gcb(to_address(GKaKj.origin()) + Gca.num_elements(), 1675 {noccb_tot, nmo_tot, nwalk}); 1676 boost::multi::array_ref<SPComplexType, 4> GKK(to_address(GKKaj.origin()), 1677 {nspin, nkpts, nkpts, nwalk * npol * akmax}); 1678 int na0 = 0; 1679 for (int Ka = 0, Kaj = 0; Ka < nkpts; Ka++) 1680 { 1681 int na = nelpk[nd][Ka]; 1682 int nj0 = 0; 1683 for (int Kj = 0; Kj < nkpts; Kj++, Kaj++) 1684 { 1685 int nj = nopk[Kj]; 1686 if (Kaj % comm->size() != comm->rank()) 1687 { 1688 nj0 += nj; 1689 continue; 1690 } 1691 auto G_(to_address(GKK[0][Ka][Kj].origin())); 1692 int naj = na * nj * npol; 1693 for (int a = 0, asj = 0; a < na; a++) 1694 { 1695 for (int p = 0; p < npol; p++) 1696 { 1697 auto Gc_(to_address(Gca[na0 + a][p][nj0].origin())); 1698 for (int j = 0; j < nj; j++, asj++) 1699 { 1700 for (int w = 0, waj = 0; w < nwalk; w++, ++Gc_, waj += naj) 1701 #ifdef MIXED_PRECISION 1702 G_[waj + asj] = static_cast<SPComplexType>(*Gc_); 1703 #else 1704 G_[waj + asj] = (*Gc_); 1705 #endif 1706 } 1707 } 1708 } 1709 nj0 += nj; 1710 } 1711 na0 += na; 1712 } 1713 if (nspin > 1) 1714 { 1715 na0 = 0; 1716 for (int Ka = 0, Kaj = 0; Ka < nkpts; Ka++) 1717 { 1718 int na = nelpk[nd][nkpts + Ka]; 1719 int nj0 = 0; 1720 for (int Kj = 0; Kj < nkpts; Kj++, Kaj++) 1721 { 1722 int nj = nopk[Kj]; 1723 if (Kaj % comm->size() != comm->rank()) 1724 { 1725 nj0 += nj; 1726 continue; 1727 } 1728 auto G_(to_address(GKK[1][Ka][Kj].origin())); 1729 int naj = na * nj; 1730 for (int a = 0, aj = 0; a < na; a++) 1731 { 1732 auto Gc_(to_address(Gcb[na0 + a][nj0].origin())); 1733 for (int j = 0; j < nj; j++, aj++) 1734 { 1735 for (int w = 0, waj = 0; w < nwalk; w++, ++Gc_, waj += naj) 1736 #ifdef MIXED_PRECISION 1737 G_[waj + aj] = static_cast<SPComplexType>(*Gc_); 1738 #else 1739 G_[waj + aj] = (*Gc_); 1740 #endif 1741 } 1742 } 1743 nj0 += nj; 1744 } 1745 na0 += na; 1746 } 1747 } 1748 comm->barrier(); 1749 } 1750 1751 // for testing purposes only 1752 template<class MatA, class MatB> 1753 void GwAK_to_GAKw(MatA const& GwAK, MatB&& GAKw) 1754 { 1755 int nwalk = GwAK.size(0); 1756 int nAK = GwAK.size(1); 1757 for (int w = 0; w < nwalk; w++) 1758 for (int AK = 0; AK < nAK; AK++) 1759 GAKw[AK][w] = GwAK[w][AK]; 1760 } 1761 }; 1762 1763 } // namespace afqmc 1764 1765 } // namespace qmcplusplus 1766 1767 #endif 1768