//////////////////////////////////////////////////////////////////////////////
// This file is distributed under the University of Illinois/NCSA Open Source
// License. See LICENSE file in top directory for details.
//
// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
//
// File developed by:
//    Miguel A. Morales, moralessilva2@llnl.gov
//    Lawrence Livermore National Laboratory
//
// File created by:
//    Miguel A. Morales, moralessilva2@llnl.gov
//    Lawrence Livermore National Laboratory
//////////////////////////////////////////////////////////////////////////////

#include <vector>
#include <map>
#include <string>
#include <iostream>
#include <tuple>
#include <mutex>

#include "AFQMC/config.h"
#include "AFQMC/Numerics/csr_blas.hpp"
#include "AFQMC/Wavefunctions/PHMSD.hpp"

//#include "AFQMC/Wavefunctions/PHMSD.h"

namespace qmcplusplus
{
namespace afqmc
{
/*
 * Calculates the local energy and overlaps of all the walkers in the set and
 * returns them in the appropriate data structures.
 */
template<class WlkSet, class Mat, class TVec>
void PHMSD::Energy_shared(const WlkSet& wset, Mat&& E, TVec&& Ov)
{
  using ma::conj;
  using std::get;
  int nspins = 2; //(walker_type==COLLINEAR?2:1);
  size_t nkev = HamOp.number_of_ke_vectors();
  assert(E.dimensionality == 2);
  assert(Ov.dimensionality == 1);
  assert(E.size(0) == wset.size());
  assert(Ov.size(0) == wset.size());
  assert(E.size(1) == 3);

  ComplexType zero(0.0);
  auto Gsize = dm_size(false);
  auto nwalk = wset.size();
  double LogOverlapFactor(wset.getLogOverlapFactor());
  // resize shm structures if needed
  if (Ovmsd.shape() != std::make_tuple(long(nspins), long(maxn_unique_confg), long(nwalk)))
    Ovmsd.reextent({nspins, maxn_unique_confg, nwalk});
  if (Emsd.shape() != std::make_tuple(long(nspins), long(maxn_unique_confg), long(nwalk), 3l))
    Emsd.reextent({nspins, maxn_unique_confg, nwalk, 3});
  // resize Alpha here, if beta is needed resize below
  if (GrefA.shape() != std::make_tuple(nwalk, dm_dims(false, Alpha).first, dm_dims(false, Alpha).second))
    GrefA.reextent({nwalk, dm_dims(false, Alpha).first, dm_dims(false, Alpha).second});
  if (wgt.size() != nwalk)
    wgt.reextent(iextensions<1u>{nwalk});
  if (opSpinEJ.size() != nwalk)
    opSpinEJ.reextent(iextensions<1u>{nwalk});
  if (localGbuff.size() < 2 * Gsize)
    localGbuff.reextent(iextensions<1u>{2 * Gsize});
  if (eloc2.size(0) != nwalk || eloc2.size(1) != 3)
    eloc2.reextent({nwalk, 3});

  std::fill_n(Ov.origin(), nwalk, zero);
  std::fill_n(E.origin(), 3 * nwalk, zero);
  std::fill_n(opSpinEJ.origin(), nwalk, ComplexType(0.0));

  // dummy: ugly but not sure how to do it better
  SPCMatrix* dummyMatPtr(nullptr);

  auto refc = abij.reference_configuration();
  std::vector<int> confg(NAEA);
  auto confgs = abij.configurations_begin();

  if (walker_type != COLLINEAR)
    APP_ABORT("Error: Finish implementation of PHMSD for CLOSED/NONCOLLINEAR walkers.\n");

  // FIX FIX FIX: Incorrect if walker_type==CLOSED
  // 1. calculate energies for unique determinants
  //    - Emsd[spin][nd_unique][iw][{0:E1, 1:EXX, 2:--}]
  //    - Ovmsd[spin][nd_unique][iw]
  if (fast_ph_energy)
  {
    // assume [nwalk][ik] for now. If needed, generalize later
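    // NOTE on the shared structures resized above (layout inferred from the
    // reextent calls):
    //   Ovmsd[spin][nd][iw]   : overlap of unique determinant nd with walker iw
    //   Emsd[spin][nd][iw][t] : energy terms, t = 0:E1, 1:EXX, 2:EJ
    //                           (the EJ slot is unused on this fast path,
    //                           hence the "2:--" above)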
    if (GrefB.shape() != std::make_tuple(nwalk, dm_dims(false, Beta).first, dm_dims(false, Beta).second))
      GrefB.reextent({nwalk, dm_dims(false, Beta).first, dm_dims(false, Beta).second});
    if (QQ0A.shape() != std::make_tuple(nwalk, long(OrbMats[0].size(0)), NAEA))
      QQ0A.reextent({nwalk, OrbMats[0].size(0), NAEA});
    if (QQ0B.shape() != std::make_tuple(nwalk, long(OrbMats.back().size(0)), NAEB))
      QQ0B.reextent({nwalk, OrbMats.back().size(0), NAEB});

    for (int iw = 0, nc = 0; iw < nwalk; iw++)
    {
      if (nc % TG.TG_local().size() == TG.TG_local().rank())
      {
        boost::multi::array_ref<ComplexType, 2> G2D_(localGbuff.origin(),
                                                     {dm_dims_ref(false, Alpha).first,
                                                      dm_dims_ref(false, Alpha).second});
        Ovmsd[0][0][iw] = SDetOp.MixedDensityMatrixForWoodbury(OrbMats[0], *wset[iw].SlaterMatrix(Alpha), G2D_,
                                                               LogOverlapFactor, refc, QQ0A[iw], true);
        confg.resize(NAEA);
        abij.get_configuration(0, 0, confg);
        auto Gr = GrefA[iw];
        std::fill_n(Gr.origin(), Gr.num_elements(), ComplexType(0.0));
        for (int k = 0; k < confg.size(); ++k)
          Gr[confg[k]] = G2D_[k];
      }
      ++nc;
      if (nc % TG.TG_local().size() == TG.TG_local().rank())
      {
        boost::multi::array_ref<ComplexType, 2> G2D_(localGbuff.origin(),
                                                     {dm_dims_ref(false, Beta).first,
                                                      dm_dims_ref(false, Beta).second});
        Ovmsd[1][0][iw] =
            SDetOp.MixedDensityMatrixForWoodbury(OrbMats.back(),
                                                 *wset[iw].SlaterMatrix(SpinTypes((walker_type == CLOSED) ? 0 : 1)),
                                                 G2D_, LogOverlapFactor, refc, QQ0B[iw], true);
        confg.resize(NAEB);
        abij.get_configuration(1, 0, confg);
        auto Gr = GrefB[iw];
        std::fill_n(Gr.origin(), Gr.num_elements(), ComplexType(0.0));
        for (int k = 0; k < confg.size(); ++k)
          Gr[confg[k]] = G2D_[k];
      }
      ++nc;
    }
    TG.local_barrier();
    HamOp.fast_energy(Emsd, Ovmsd, GrefA, GrefB, QQ0A, QQ0B, Qwork, abij, det_couplings);
  }
  else
  {
    ComplexType ov0;
    if (KEright.shape() != std::make_tuple(long(abij.number_of_unique_excitations()[0]), long(nwalk), long(nkev)))
      KEright.reextent({abij.number_of_unique_excitations()[0], nwalk, nkev});
    if (KEleft.shape() != std::make_tuple(long(nwalk), long(nkev)))
      KEleft.reextent({nwalk, nkev});
    for (int spin = 0; spin < nspins; ++spin)
    {
      int orb_spin_indx = (OrbMats.size() == 2) ? spin : 0;
      int wlk_spin_indx = (walker_type == CLOSED) ? 0 : spin;
      confg.resize((spin == 0) ? NAEA : NAEB);
      auto Gdims = dm_dims(false, SpinTypes(spin));
      auto Gdims_ref = dm_dims_ref(false, SpinTypes(spin));
      boost::multi::array_ref<ComplexType, 2> G2D_(localGbuff.origin(), {Gdims_ref.first, Gdims_ref.second});
      int nr = Gdims.first * Gdims.second, nc = nwalk;
      if (transposed_G_for_E_)
        std::swap(nr, nc);
      // GrefA is guaranteed to have enough space
      boost::multi::array_ref<ComplexType, 2> G(to_address(GrefA.origin()), {nr, nc});
      for (int nd = 0; nd < det_couplings[spin].size(); ++nd)
      {
        abij.get_configuration(spin, nd, confg);
        // keeping this simple for now!
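        // NOTE: work is distributed round-robin over the cores of the local
        // task group; a core executes a task only when
        //   task_index % TG.TG_local().size() == TG.TG_local().rank().
        // The same pattern recurs throughout this file; as a minimal sketch
        // (ntasks/nranks/rank/do_task are hypothetical names):
        //   for (int t = 0; t < ntasks; ++t)
        //     if (t % nranks == rank)
        //       do_task(t);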
        for (int iw = 0; iw < nwalk; iw++)
        {
          if (iw % TG.TG_local().size() == TG.TG_local().rank())
          {
            Ovmsd[spin][nd][iw] =
                SDetOp.MixedDensityMatrixFromConfiguration(OrbMats[orb_spin_indx],
                                                           *wset[iw].SlaterMatrix(SpinTypes(wlk_spin_indx)), G2D_,
                                                           LogOverlapFactor, confg.data(), true);
            if (transposed_G_for_E_)
            {
              boost::multi::array_ref<ComplexType, 3> G3D(to_address(G.origin()), {nwalk, Gdims.first, Gdims.second});
              std::fill_n(G[iw].origin(), Gdims.first * Gdims.second, ComplexType(0.0));
              for (int k = 0; k < confg.size(); ++k)
                G3D[iw][confg[k]] = G2D_[k];
            }
            else
            {
              boost::multi::array_ref<ComplexType, 3> G3D(to_address(G.origin()), {Gdims.first, Gdims.second, nwalk});
              for (int k = 0; k < Gdims.first; ++k)
                for (int j = 0; j < Gdims.second; ++j)
                  G3D[k][j][iw] = ComplexType(0.0);
              for (int k = 0; k < confg.size(); ++k)
                G3D[confg[k]]({0, Gdims.second}, iw) = G2D_[k];
            }
          }
        }
        TG.local_barrier();
        if (spin == 0)
        {
          auto KEr = KEright[nd];
          HamOp.energy(eloc2, G, orb_spin_indx, dummyMatPtr, std::addressof(KEr), TG.TG_local().root(), true, true);
          // reduce_n since Emsd is in shared memory (doing this instead of copy with mutex)
          TG.TG_local().reduce_n(to_address(eloc2.origin()), 3 * nwalk, to_address(Emsd[spin][nd].origin()),
                                 std::plus<>(), 0);
          //std::cout<<" E: " <<spin <<" " <<nd <<" " <<Ovmsd[spin][nd][0] <<" "
          //<<eloc2[0][0] <<" "
          //<<eloc2[0][1] <<" "
          //<<eloc2[0][2] <<std::endl;
          TG.local_barrier();
        }
        else
        {
          HamOp.energy(eloc2, G, orb_spin_indx, std::addressof(KEleft), dummyMatPtr, TG.TG_local().root(), true, true);
          //std::cout<<" E: " <<spin <<" " <<nd <<" " <<Ovmsd[spin][nd][0] <<" "
          //<<eloc2[0][0] <<" "
          //<<eloc2[0][1] <<" "
          //<<eloc2[0][2] <<std::endl;
          // reduce_n since Emsd is in shared memory (doing this instead of copy with mutex)
          TG.TG_local().reduce_n(to_address(eloc2.origin()), 3 * nwalk, to_address(Emsd[spin][nd].origin()),
                                 std::plus<>(), 0);
          TG.local_barrier();
          // round-robin KE contributions
          // iterators to configurations containing this beta configuration
          auto it = to_address(det_couplings[1].values()) + (*det_couplings[1].pointers_begin(nd));
          auto ite = to_address(det_couplings[1].values()) + (*det_couplings[1].pointers_end(nd));
          int nt = 0;
          for (; it < ite; ++it)
          {
            for (int iw = 0; iw < nwalk; ++iw, ++nt)
            {
              if (nt % TG.TG_local().size() == TG.TG_local().rank())
              {
                size_t nd_alp = get<0>(*(confgs + (*it)));
                auto w_ = ma::conj(get<2>(*(confgs + (*it)))) * Ovmsd[0][nd_alp][iw] * Ovmsd[1][nd][iw];
                opSpinEJ[iw] += w_ * static_cast<ComplexType>(ma::dot(KEleft[iw], KEright[nd_alp][iw]));
              }
            }
          }
        }
      }
    }
  }
  TG.local_barrier();
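  // NOTE: for walker w the multideterminant overlap assembled below is
  //   Ov[w] = sum_n conj(c_n) * Ovmsd[0][a(n)][w] * Ovmsd[1][b(n)][w],
  // where n runs over configurations, c_n is the CI coefficient, and
  // a(n)/b(n) index the unique alpha/beta determinants of configuration n.
  // Step 2 evaluates this sum reorganized per unique determinant, using
  // det_couplings to locate the configurations that contain it, and
  // accumulates the energy terms with the same weights.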
  // 2. assemble sum over configurations
  int nc = 0;
  for (int spin = 0; spin < nspins; ++spin)
  {
    for (int nd = 0; nd < det_couplings[spin].size(); ++nd, ++nc)
    {
      if (nc % TG.TG_local().size() == TG.TG_local().rank())
      {
        auto it = to_address(det_couplings[spin].values()) + (*det_couplings[spin].pointers_begin(nd));
        auto ite = to_address(det_couplings[spin].values()) + (*det_couplings[spin].pointers_end(nd));
        std::fill_n(wgt.origin(), nwalk, ComplexType(0.0));
        if (spin == 0)
        {
          for (; it < ite; ++it)
          {
            auto ci = ma::conj(get<2>(*(confgs + (*it))));
            auto Ovmsd_ = Ovmsd[1][get<1>(*(confgs + (*it)))];
            for (int iw = 0; iw < nwalk; ++iw)
            {
              wgt[iw] += ci * Ovmsd_[iw];
            }
          }
          for (int iw = 0; iw < nwalk; ++iw)
          {
            wgt[iw] *= Ovmsd[0][nd][iw];
            Ov[iw] += wgt[iw];
          }
        }
        else
        {
          for (; it < ite; ++it)
          {
            auto ci = ma::conj(get<2>(*(confgs + (*it))));
            auto Ovmsd_ = Ovmsd[0][get<0>(*(confgs + (*it)))];
            for (int iw = 0; iw < nwalk; ++iw)
              wgt[iw] += ci * Ovmsd_[iw];
          }
          for (int iw = 0; iw < nwalk; ++iw)
            wgt[iw] *= Ovmsd[1][nd][iw];
        }
        //Emsd[spin][nd_unique][iw][{0:E1, 1:EXX, 2:EJ}]
        for (int iw = 0; iw < nwalk; ++iw)
        { // remove scaling from CLOSED shell energy evaluation
          E[iw][0] += wgt[iw] * Emsd[spin][nd][iw][0] / 2.0;
          E[iw][1] += wgt[iw] * Emsd[spin][nd][iw][1] / 2.0;
          E[iw][2] += wgt[iw] * Emsd[spin][nd][iw][2] / 4.0;
        }
      } //nd%TG.TG_local().size()==TG.TG_local().rank()
    }   //nex
  }     // spin

  // 3. reduce over TG_local
  TG.TG_local().all_reduce_in_place_n(to_address(opSpinEJ.origin()), nwalk, std::plus<>());
  TG.TG_local().all_reduce_in_place_n(to_address(Ov.origin()), nwalk, std::plus<>());
  TG.TG_local().all_reduce_in_place_n(to_address(E.origin()), 3 * nwalk, std::plus<>());
  for (int i = 0; i < nwalk; ++i)
  {
    E[i][0] /= Ov[i];
    E[i][1] /= Ov[i];
    E[i][2] = (E[i][2] + opSpinEJ[i]) / Ov[i];
  }
  TG.local_barrier();
}

/*
 * Calculates the local energy and overlaps of all the walkers in the set and
 * returns them in the appropriate data structures.
 */
template<class WlkSet, class Mat, class TVec>
void PHMSD::Energy_distributed(const WlkSet& wset, Mat&& E, TVec&& Ov)
{
  APP_ABORT(" Error: Finish PHMSD::Energy_distributed. \n");
  /*
  //1. Calculate G and overlaps
  //2. Loop over nodes in TG
  //   2.a isend G to next node. irecv next G from "previous" node
  //   2.b add local contribution to current G
  //   2.c wait for comms to finish
  //3. all reduce resulting energies

  assert(ci.size()==1);
  bool new_shm_space=false;
  const int node_number = TG.getLocalGroupNumber();
  const int nnodes = TG.getNGroupsPerTG();
  const int Gsize = dm_size(false);
  const ComplexType zero(0.0);
  const int nwalk = wset.size();
  // allocate space in shared memory for:
  //   i. 2 copies of G (always compact),
  //   ii. ovlps for local walkers
  //   iii. energies[3] for all walkers on all nodes of TG (assume all nodes have same # of walkers)
  int nt = nwalk*(2*Gsize+1);
  if(not shmbuff_for_E) {
    shmbuff_for_E = std::make_unique<SHM_Buffer>(TG.TG_local(),nt);
    new_shm_space=true;
  }
  // in case the number of walkers changes
  if(shmbuff_for_E->num_elements() < nt) {
    shmbuff_for_E = std::make_unique<SHM_Buffer>(TG.TG_local(),nt);
    new_shm_space=true;
  }
  assert(shmbuff_for_E->num_elements() >= nt);
  assert(E.dimensionality==2);
  assert(Ov.dimensionality==1);
  assert(E.size(0)==wset.size());
  assert(Ov.size(0)==wset.size());
  assert(E.size(1)==3);

  int nr=Gsize,nc=nwalk;
  if(transposed_G_for_E_) std::swap(nr,nc);
  int displ=0;
  boost::multi::array_ref<ComplexType,2> Gwork(to_address(shmbuff_for_E->origin()),
                                               {nr,nc});
  displ += Gsize*nwalk;
  boost::multi::array_ref<ComplexType,2> Grecv(to_address(shmbuff_for_E->origin())+displ,
                                               {nr,nc});
  displ += Gsize*nwalk;
  boost::multi::array_ref<ComplexType,1> overlaps(to_address(shmbuff_for_E->origin())+displ,
                                                  iextensions<1u>{nwalk});
  if(eloc2.size(0) != nnodes*nwalk || eloc2.size(1) != 3)
    eloc2.resize({nnodes*nwalk,3});
  auto elocal = eloc2[node_number*nwalk];
  int nak0,nak1;
  std::tie(nak0,nak1) = FairDivideBoundary(TG.getLocalTGRank(),Gsize*nwalk,TG.getNCoresPerTG());

  if(new_shm_space) {
    // use mpi3 when ready
    if(req_Grecv!=MPI_REQUEST_NULL)
      MPI_Request_free(&req_Grecv);
    if(req_Gsend!=MPI_REQUEST_NULL)
      MPI_Request_free(&req_Gsend);
    MPI_Send_init(Gwork.origin()+nak0,(nak1-nak0)*sizeof(ComplexType),MPI_CHAR,
                  TG.prev_core(),1234,TG.TG().impl_,&req_Gsend);
    MPI_Recv_init(Grecv.origin()+nak0,(nak1-nak0)*sizeof(ComplexType),MPI_CHAR,
                  TG.next_core(),1234,TG.TG().impl_,&req_Grecv);
  }

  std::fill_n(eloc2.origin(),3*nnodes*nwalk,ComplexType(0.0));
  TG.local_barrier();

  MPI_Status st;

  // calculate G for local walkers
  MixedDensityMatrix_for_E(wset,Gwork,overlaps,0);

  for(int k=0; k<nnodes; k++) {

    // wait for G from node behind you, copy to Gwork
    if(k>0) {
      MPI_Wait(&req_Grecv,&st);
      MPI_Wait(&req_Gsend,&st); // need to wait for Gsend in order to overwrite Gwork
      std::copy_n(Grecv.origin()+nak0,(nak1-nak0),Gwork.origin()+nak0);
      TG.local_barrier();
    }

    // post send/recv messages with nodes ahead and behind you
    if(k < nnodes-1) {
      MPI_Start(&req_Gsend);
      MPI_Start(&req_Grecv);
    }

    // calculate your contribution of the local energy to the set of walkers in Gwork
    int q = (k+node_number)%nnodes;
    HamOp.energy(eloc2.sliced(q*nwalk,(q+1)*nwalk),
                 Gwork,0,TG.TG_local().root() && k==0);
    TG.local_barrier();

  }
  TG.TG().all_reduce_in_place_n(eloc2.origin(),3*nnodes*nwalk,std::plus<>());
  TG.local_barrier();
  std::copy_n(elocal.origin(),3*nwalk,E.origin());
  std::copy_n(overlaps.origin(),nwalk,Ov.origin());
  TG.local_barrier();
*/
}

/*
 * This routine has (potentially) considerable overhead if either the number of determinants
 * or the number of walkers changes.
 * G is assumed to be in shared memory.
 * Ov is assumed to be local to the core.
 */
template<class WlkSet, class MatG, class TVec>
void PHMSD::MixedDensityMatrix(const WlkSet& wset, MatG&& G, TVec&& Ov, bool compact, bool transpose)
{
  // if not compact, calculate compact on temporary storage and multiply by OrbMat[] on the left at the end
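  // NOTE: "compact" refers to the Green's function in the basis of the trial
  // orbitals; the relation used below (see the ma::product calls on
  // Gfulla/Gfullb) is, schematically,
  //   G_full = T(OrbMat) * G_compact,
  // so the full-size object is only formed when requested.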
  using ma::T;
  assert(G.stride(1) == 1);
  assert(Ov.stride(0) == 1);
  if (transpose)
    assert(G.size(0) == wset.size() && G.size(1) == size_t(dm_size(not compact)));
  else
    assert(G.size(1) == wset.size() && G.size(0) == size_t(dm_size(not compact)));
  const int nw = wset.size();
  auto refc = abij.reference_configuration();
  double LogOverlapFactor(wset.getLogOverlapFactor());
  assert(Ov.size() >= nw);
  std::fill_n(Ov.begin(), nw, 0);
  for (int i = 0; i < G.size(0); i++)
    if (i % TG.TG_local().size() == TG.TG_local().rank())
      std::fill_n(G[i].origin(), G.size(1), ComplexType(0.0));
  TG.local_barrier();
  auto Gsize = size_t(dm_size(not compact));
  if (compact)
  {
    if (localGbuff.size() < 2 * Gsize) // 2 copies needed
      localGbuff.reextent(iextensions<1u>{2 * Gsize});
  }
  else
  {
    if (localGbuff.size() < 3 * Gsize) // 3 copies needed
      localGbuff.reextent(iextensions<1u>{3 * Gsize});
  }
  if (walker_type != COLLINEAR)
  {
    APP_ABORT(" Error: Finish implementation of PHMSD::MixedDensityMatrix for CLOSED and NONCOLLINEAR. \n");
  }
  else
  {
    // always calculate compact and multiply by OrbMat at the end if full
    auto Gsize_c = size_t(dm_size(false));
    auto GAdims = dm_dims(false, Alpha);
    auto GBdims = dm_dims(false, Beta);
    auto GAdims_full = dm_dims(true, Alpha);
    auto GBdims_full = dm_dims(true, Beta);
    if (compact)
    {
      GAdims_full = {0, 0};
      GBdims_full = {0, 0};
    }
    auto GAdims0 = dm_dims_ref(false, Alpha);
    auto GBdims0 = dm_dims_ref(false, Beta);
    size_t cnt = 0;
    // REDUCE ALL THIS TEMPORARY STORAGE!!!
    // storage for reference Green functions
    boost::multi::array_ref<ComplexType, 2> GA2D0_(localGbuff.origin(), {GAdims0.first, GAdims0.second});
    cnt += GA2D0_.num_elements();
    boost::multi::array_ref<ComplexType, 2> GB2D0_(localGbuff.origin() + cnt, {GBdims0.first, GBdims0.second});
    cnt += GB2D0_.num_elements();
    // storage for Gw in case need to transpose result at the end
    boost::multi::array_ref<ComplexType, 2> GA2D_(localGbuff.origin() + cnt, {GAdims.first, GAdims.second});
    cnt += GA2D_.num_elements();
    boost::multi::array_ref<ComplexType, 2> GB2D_(localGbuff.origin() + cnt, {GBdims.first, GBdims.second});
    cnt += GB2D_.num_elements();
    boost::multi::array_ref<ComplexType, 1> GA1D_(GA2D_.origin(), iextensions<1u>{GAdims.first * GAdims.second});
    boost::multi::array_ref<ComplexType, 1> GB1D_(GB2D_.origin(), iextensions<1u>{GBdims.first * GBdims.second});
    // storage for full G in case compact=false
    boost::multi::array_ref<ComplexType, 2> Gfulla(localGbuff.origin() + cnt, {GAdims_full.first, GAdims_full.second});
    cnt += Gfulla.num_elements();
    boost::multi::array_ref<ComplexType, 2> Gfullb(localGbuff.origin() + cnt, {GBdims_full.first, GBdims_full.second});
    cnt += Gfullb.num_elements();

    const int ntasks_percore = nw / TG.getNCoresPerTG();
    const int ntasks_total_serial = ntasks_percore * TG.getNCoresPerTG();
    const int nextra = nw - ntasks_total_serial;

    // each processor does ntasks_percore_serial overlaps serially
    const int w0 = TG.getLocalTGRank() * ntasks_percore;
    const int wN = (TG.getLocalTGRank() + 1) * ntasks_percore;

    // task_w_d = wlk_w*ndet + d
    local_ov[0][0] = 1.0;
    local_ov[1][0] = 1.0;
    for (int iw = w0; iw < wN; ++iw)
    {
      // 1. calculate list of overlaps
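      // NOTE: local_ov[spin][k] holds the factor contributed by the k-th
      // unique excitation of that spin (k = 0 is the reference, fixed to 1
      // above); it multiplies the reference overlap ov0 in the accumulation
      // below, so each unique determinant is evaluated once per walker
      // rather than once per configuration.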
      ComplexType ov0 = SDetOp.MixedDensityMatrixForWoodbury(OrbMats[0], *wset[iw].SlaterMatrix(Alpha), GA2D0_,
                                                             LogOverlapFactor, refc, local_QQ0inv0, true);
      calculate_overlaps(0, 1, 0, abij, local_QQ0inv0, Qwork, local_ov[0]);
      ov0 *= SDetOp.MixedDensityMatrixForWoodbury(OrbMats.back(), *wset[iw].SlaterMatrix(Beta), GB2D0_,
                                                  LogOverlapFactor, refc + NAEA, local_QQ0inv1, true);
      calculate_overlaps(0, 1, 1, abij, local_QQ0inv1, Qwork, local_ov[1]);
      for (auto it = abij.configurations_begin(); it < abij.configurations_end(); ++it)
        Ov[iw] += ma::conj(std::get<2>(*it)) * ov0 * local_ov[0][std::get<0>(*it)] * local_ov[1][std::get<1>(*it)];

      // 2. generate R[Nact,Nel] and generate G
      boost::multi::array_ref<ComplexType, 2> Ra(Gwork.origin(), {NAEA, long(OrbMats[0].size(0))});
      calculate_R(0, 1, 0, abij, det_couplings[0], local_QQ0inv0, Qwork, local_ov[1], ov0, Ra);
      if (transpose)
      {
        if (compact)
        {
          boost::multi::array_ref<ComplexType, 2> Gw(to_address(G[iw].origin()), {GAdims.first, GAdims.second});
          ma::product(T(Ra), GA2D0_, Gw);
        }
        else
        {
          boost::multi::array_ref<ComplexType, 2> Gw(to_address(G[iw].origin()),
                                                     {GAdims_full.first, GAdims_full.second});
          ma::product(T(Ra), GA2D0_, GA2D_);
          ma::product(T(OrbMats[0]), GA2D_, Gw);
        }
      }
      else
      {
        if (compact)
        {
          ma::product(T(Ra), GA2D0_, GA2D_);
          //G({0,GAdims.first*GAdims.second},iw) = GA1D_;
          ma::copy(GA1D_, G({0, GAdims.first * GAdims.second}, iw));
        }
        else
        {
          boost::multi::array_ref<ComplexType, 1> G1D(Gfulla.origin(), iextensions<1u>{long(Gfulla.num_elements())});
          ma::product(T(Ra), GA2D0_, GA2D_);
          ma::product(T(OrbMats[0]), GA2D_, Gfulla);
          ma::copy(G1D, G({0, Gfulla.num_elements()}, iw));
          //G({0,Gfulla.num_elements()},iw) = G1D;
        }
      }

      boost::multi::array_ref<ComplexType, 2> Rb(Gwork.origin(), {NAEB, long(OrbMats.back().size(0))});
      calculate_R(0, 1, 1, abij, det_couplings[1], local_QQ0inv1, Qwork, local_ov[0], ov0, Rb);
      if (transpose)
      {
        if (compact)
        {
          boost::multi::array_ref<ComplexType, 2> Gw(to_address(G[iw].origin()) + GAdims.first * GAdims.second,
                                                     {GBdims.first, GBdims.second});
          ma::product(T(Rb), GB2D0_, Gw);
        }
        else
        {
          boost::multi::array_ref<ComplexType, 2> Gw(to_address(G[iw].origin()) +
                                                         GAdims_full.first * GAdims_full.second,
                                                     {GBdims_full.first, GBdims_full.second});
          ma::product(T(Rb), GB2D0_, GB2D_);
          ma::product(T(OrbMats.back()), GB2D_, Gw);
        }
      }
      else
      {
        if (compact)
        {
          ma::product(T(Rb), GB2D0_, GB2D_);
          //G({GAdims.first*GAdims.second,G.size(0)},iw) = GB1D_;
          ma::copy(GB1D_, G({GAdims.first * GAdims.second, G.size(0)}, iw));
        }
        else
        {
          boost::multi::array_ref<ComplexType, 1> G1D(Gfullb.origin(), iextensions<1u>{Gfullb.num_elements()});
          ma::product(T(Rb), GB2D0_, GB2D_);
          ma::product(T(OrbMats.back()), GB2D_, Gfullb);
          //G({Gfulla.num_elements(),G.size(0)},iw) = G1D;
          ma::copy(G1D, G({Gfulla.num_elements(), G.size(0)}, iw));
        }
      }
    }
    // all remaining overlaps are performed in parallel with blocks of cores
    // partition processors into nextra groups
    if (nextra > 0)
    {
      // check if new communicator is necessary
      if (last_number_extra_tasks != nextra)
      {
        last_number_extra_tasks = nextra;
        for (int n = 0; n < nextra; n++)
        {
          int n0, n1;
          std::tie(n0, n1) = FairDivideBoundary(n, TG.getNCoresPerTG(), nextra);
          if (TG.getLocalTGRank() >= n0 && TG.getLocalTGRank() < n1)
          {
            last_task_index = n;
            break;
          }
        }
        // first setup
        local_group_comm = shared_communicator(TG.TG_local().split(last_task_index, TG.TG_local().size()));
        // this probably does not work!!!
        {
          shmCMatrix unique_overlaps_({2, maxn_unique_confg}, shared_allocator<ComplexType>{local_group_comm});
          unique_overlaps.swap(unique_overlaps_);
        }
        {
          shmCMatrix QQ0inv0_({OrbMats[0].size(0), NAEA}, shared_allocator<ComplexType>{local_group_comm});
          QQ0inv0.swap(QQ0inv0_);
        }
        {
          shmCMatrix QQ0inv1_({OrbMats.back().size(0), NAEB}, shared_allocator<ComplexType>{local_group_comm});
          QQ0inv1.swap(QQ0inv1_);
        }
        {
          shmCMatrix GA2D0_shm_({GAdims0.first, GAdims0.second}, shared_allocator<ComplexType>{local_group_comm});
          GA2D0_shm.swap(GA2D0_shm_);
        }
        {
          shmCMatrix GB2D0_shm_({GBdims0.first, GBdims0.second}, shared_allocator<ComplexType>{local_group_comm});
          GB2D0_shm.swap(GB2D0_shm_);
        }
      }
      if (last_task_index < 0 || last_task_index > nextra)
        APP_ABORT("Error: Problems in PHMSD::Overlap(WSet,Ov)");
      {
        if (local_group_comm.rank() == 0)
          unique_overlaps[0][0] = 1.0;
        if (local_group_comm.rank() == 0)
          unique_overlaps[1][0] = 1.0;
        local_group_comm.barrier();

        int M0, Mn, sz = GAdims.second;
        std::tie(M0, Mn) = FairDivideBoundary(local_group_comm.rank(), sz, local_group_comm.size());
        int iw = (last_task_index + ntasks_total_serial);
        ComplexType ov0 = SDetOp.MixedDensityMatrixForWoodbury(OrbMats[0], *wset[iw].SlaterMatrix(Alpha), GA2D0_shm,
                                                               LogOverlapFactor, refc, QQ0inv0, local_group_comm, true);
        calculate_overlaps(local_group_comm.rank(), local_group_comm.size(), 0, abij, QQ0inv0, Qwork,
                           unique_overlaps[0]);
        local_group_comm.barrier();
        ov0 *= SDetOp.MixedDensityMatrixForWoodbury(OrbMats.back(), *wset[iw].SlaterMatrix(Beta), GB2D0_shm,
                                                    LogOverlapFactor, refc + NAEA, QQ0inv1, local_group_comm, true);
        calculate_overlaps(local_group_comm.rank(), local_group_comm.size(), 1, abij, QQ0inv1, Qwork,
                           unique_overlaps[1]);
        local_group_comm.barrier();
        size_t ic = 0;
        for (auto it = abij.configurations_begin(); it < abij.configurations_end(); ++it, ++ic)
          if (ic % local_group_comm.size() == local_group_comm.rank())
            Ov[iw] += ma::conj(std::get<2>(*it)) * ov0 * unique_overlaps[0][std::get<0>(*it)] *
                unique_overlaps[1][std::get<1>(*it)];

        // 2. generate R[Nact,Nel] and generate G
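        // NOTE: calculate_R accumulates the CI-weighted contributions of the
        // unique excitations into Ra/Rb; the compact Green's function then
        // follows from the reference one as G_c = T(R) * G_ref (ma::product
        // calls below), and when compact == false one more product gives
        // G_full = T(OrbMat) * G_c.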
        boost::multi::array_ref<ComplexType, 2> Ra(Gwork.origin(), {NAEA, long(OrbMats[0].size(0))});
        calculate_R(local_group_comm.rank(), local_group_comm.size(), 0, abij, det_couplings[0], QQ0inv0, Qwork,
                    unique_overlaps[1], ov0, Ra);
        local_group_comm.all_reduce_in_place_n(to_address(Ra.origin()), Ra.num_elements(), std::plus<>());
        if (transpose)
        {
          if (compact)
          {
            boost::multi::array_ref<ComplexType, 2> Gw(to_address(G[iw].origin()), {GAdims.first, GAdims.second});
            ma::product(T(Ra), GA2D0_shm(GA2D0_shm.extension(0), {M0, Mn}), Gw(Gw.extension(0), {M0, Mn}));
          }
          else
          {
            boost::multi::array_ref<ComplexType, 2> Gw(to_address(G[iw].origin()),
                                                       {GAdims_full.first, GAdims_full.second});
            ma::product(T(Ra), GA2D0_shm(GA2D0_shm.extension(0), {M0, Mn}),
                        GA2D_(GA2D_.extension(0), {M0, Mn})); // can be local
            ma::product(T(OrbMats[0]), GA2D_(GA2D_.extension(0), {M0, Mn}), // can be local
                        Gw(Gw.extension(0), {M0, Mn}));
          }
        }
        else
        {
          if (compact)
          {
            ma::product(T(Ra), GA2D0_shm(GA2D0_shm.extension(0), {M0, Mn}),
                        GA2D_(GA2D_.extension(0), {M0, Mn})); // can be local
            boost::multi::array_ref<ComplexType, 3> Gw(to_address(G.origin()),
                                                       {GAdims.first, GAdims.second, long(G.size(1))});
            // copying by hand for now, implement strided copy in ma_blas
            for (size_t k = 0; k < GA2D_.size(0); ++k)
              for (size_t m = M0; m < Mn; ++m)
                Gw[k][m][iw] = GA2D_[k][m];
          }
          else
          {
            ma::product(T(Ra), GA2D0_shm(GA2D0_shm.extension(0), {M0, Mn}),
                        GA2D_(GA2D_.extension(0), {M0, Mn})); // can be local
            ma::product(T(OrbMats[0]), GA2D_(GA2D_.extension(0), {M0, Mn}),
                        Gfulla(Gfulla.extension(0), {M0, Mn})); // can be local
            boost::multi::array_ref<ComplexType, 3> Gw(to_address(G.origin()),
                                                       {long(Gfulla.size(0)), long(Gfulla.size(1)), long(G.size(1))});
            // copying by hand for now, implement strided copy in ma_blas
            for (size_t k = 0; k < Gfulla.size(0); ++k)
              for (size_t m = M0; m < Mn; ++m)
                Gw[k][m][iw] = Gfulla[k][m];
          }
        }

        boost::multi::array_ref<ComplexType, 2> Rb(Gwork.origin(), {NAEB, long(OrbMats.back().size(0))});
        calculate_R(local_group_comm.rank(), local_group_comm.size(), 1, abij, det_couplings[1], QQ0inv1, Qwork,
                    unique_overlaps[0], ov0, Rb);
        local_group_comm.all_reduce_in_place_n(to_address(Rb.origin()), Rb.num_elements(), std::plus<>());
        if (transpose)
        {
          if (compact)
          {
            boost::multi::array_ref<ComplexType, 2> Gw(to_address(G[iw].origin()) + GAdims.first * GAdims.second,
                                                       {GBdims.first, GBdims.second});
            ma::product(T(Rb), GB2D0_shm(GB2D0_shm.extension(0), {M0, Mn}), Gw(Gw.extension(0), {M0, Mn}));
          }
          else
          {
            boost::multi::array_ref<ComplexType, 2> Gw(to_address(G[iw].origin()) +
                                                           GAdims_full.first * GAdims_full.second,
                                                       {GBdims_full.first, GBdims_full.second});
            ma::product(T(Rb), GB2D0_shm(GB2D0_shm.extension(0), {M0, Mn}),
                        GB2D_(GB2D_.extension(0), {M0, Mn})); // can be local
            ma::product(T(OrbMats.back()), GB2D_(GB2D_.extension(0), {M0, Mn}), // can be local
                        Gw(Gw.extension(0), {M0, Mn}));
          }
        }
        else
        {
          if (compact)
          {
            ma::product(T(Rb), GB2D0_shm(GB2D0_shm.extension(0), {M0, Mn}),
                        GB2D_(GB2D_.extension(0), {M0, Mn})); // can be local
            boost::multi::array_ref<ComplexType, 3> Gw(to_address(G[GAdims.first * GAdims.second].origin()),
                                                       {GBdims.first, GBdims.second, long(G.size(1))});
            // copying by hand for now, implement strided copy in ma_blas
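            // (with walkers in the last dimension, consecutive (k,m)
            // elements of Gw are G.size(1) apart in memory, hence the
            // element-wise loop)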
            for (size_t k = 0; k < GB2D_.size(0); ++k)
              for (size_t m = M0; m < Mn; ++m)
                Gw[k][m][iw] = GB2D_[k][m];
          }
          else
          {
            ma::product(T(Rb), GB2D0_shm(GB2D0_shm.extension(0), {M0, Mn}),
                        GB2D_(GB2D_.extension(0), {M0, Mn})); // can be local
            ma::product(T(OrbMats.back()), GB2D_(GB2D_.extension(0), {M0, Mn}),
                        Gfullb(Gfullb.extension(0), {M0, Mn})); // can be local
            boost::multi::array_ref<ComplexType, 3> Gw(to_address(G[Gfulla.num_elements()].origin()),
                                                       {long(Gfullb.size(0)), long(Gfullb.size(1)), long(G.size(1))});
            // copying by hand for now, implement strided copy in ma_blas
            for (size_t k = 0; k < Gfullb.size(0); ++k)
              for (size_t m = M0; m < Mn; ++m)
                Gw[k][m][iw] = Gfullb[k][m];
          }
        }
      }
    }
  }
  // normalize G
  TG.TG_local().all_reduce_in_place_n(to_address(Ov.origin()), nw, std::plus<>());
  if (transpose)
  {
    for (size_t iw = 0; iw < G.size(0); ++iw)
      if (iw % TG.TG_local().size() == TG.TG_local().rank())
      {
        auto ov_ = ComplexType(1.0, 0.0) / Ov[iw];
        ma::scal(ov_, G[iw]);
      }
  }
  else
  {
    auto Ov_ = Ov.origin();
    const size_t nw_ = G.size(1);
    for (int ik = 0; ik < G.size(0); ++ik)
      if (ik % TG.TG_local().size() == TG.TG_local().rank())
      {
        auto Gik = to_address(G[ik].origin());
        for (size_t iw = 0; iw < nw_; ++iw)
          Gik[iw] /= Ov_[iw];
      }
  }
  TG.local_barrier();
}

/*
 * Computes the density matrix for a given reference.
 * G and Ov are expected to be in shared memory.
 * Simple round-robin is used.
 */
template<class WlkSet, class MatA, class MatB, class MatG, class TVec>
void PHMSD::DensityMatrix_shared(const WlkSet& wset,
                                 MatA&& RefA,
                                 MatB&& RefB,
                                 MatG&& G,
                                 TVec&& Ov,
                                 bool herm,
                                 bool compact,
                                 bool transposed)
{
  assert(G.stride(1) == 1);
  assert(Ov.stride(0) == 1);
  if (transposed)
    assert(G.size(0) == wset.size() && G.size(1) == size_t(dm_size(not compact)));
  else
    assert(G.size(1) == wset.size() && G.size(0) == size_t(dm_size(not compact)));
  const int nw = wset.size();
  assert(Ov.size() >= nw);
  // to force synchronization before modifying structures in SHM
  TG.local_barrier();
  fill_n(Ov.origin(), Ov.num_elements(), 0);
  fill_n(G.origin(), G.num_elements(), ComplexType(0.0));
  TG.local_barrier();
  double LogOverlapFactor(wset.getLogOverlapFactor());
  auto Gsize = size_t(dm_size(not compact));
  if (localGbuff.size() < Gsize)
    localGbuff.reextent(iextensions<1u>{Gsize});

  if (walker_type != COLLINEAR)
  {
    if (herm)
      assert(RefA.size(0) == dm_dims(false, Alpha).first && RefA.size(1) == dm_dims(false, Alpha).second);
    else
      assert(RefA.size(1) == dm_dims(false, Alpha).first && RefA.size(0) == dm_dims(false, Alpha).second);

    auto Gdims = dm_dims(not compact, Alpha);
    CMatrix_ref G2D_(localGbuff.origin(), {Gdims.first, Gdims.second});
    CVector_ref G1D_(G2D_.origin(), iextensions<1u>{G2D_.num_elements()});

    for (int iw = 0; iw < nw; ++iw)
    {
      if (iw % TG.TG_local().size() != TG.TG_local().rank())
        continue;
      Ov[iw] = SDetOp.MixedDensityMatrix(RefA, *wset[iw].SlaterMatrix(Alpha), G2D_, LogOverlapFactor, compact, herm);
      if (walker_type == CLOSED)
        Ov[iw] *= ComplexType(Ov[iw]);
      if (transposed)
        G[iw] = G1D_;
      else
        G(G.extension(0), iw) = G1D_;
    }
  }
  else
  {
    if (herm)
      assert(RefA.size(0) == dm_dims(false, Alpha).first && RefA.size(1) == dm_dims(false, Alpha).second);
    else
      assert(RefA.size(1) == dm_dims(false, Alpha).first && RefA.size(0) == dm_dims(false, Alpha).second);
    if (herm)
      assert(RefB.size(0) == dm_dims(false, Beta).first && RefB.size(1) == dm_dims(false, Beta).second);
    else
      assert(RefB.size(1) == dm_dims(false, Beta).first && RefB.size(0) == dm_dims(false, Beta).second);

    if (ovlp2.size(0) < 2 * nw)
      ovlp2.reextent(iextensions<1u>{2 * nw});
    fill_n(ovlp2.origin(), 2 * nw, ComplexType(0.0));
    auto GAdims = dm_dims(not compact, Alpha);
    auto GBdims = dm_dims(not compact, Beta);
    CMatrix_ref GA2D_(localGbuff.origin(), {GAdims.first, GAdims.second});
    CMatrix_ref GB2D_(GA2D_.origin() + GA2D_.num_elements(), {GBdims.first, GBdims.second});
    CVector_ref GA1D_(GA2D_.origin(), iextensions<1u>{GA2D_.num_elements()});
    CVector_ref GB1D_(GB2D_.origin(), iextensions<1u>{GB2D_.num_elements()});

    for (int iw = 0; iw < 2 * nw; ++iw)
    {
      if (iw % TG.TG_local().size() != TG.TG_local().rank())
        continue;

      if (iw % 2 == 0)
      {
        ovlp2[iw] =
            SDetOp.MixedDensityMatrix(RefA, *wset[iw / 2].SlaterMatrix(Alpha), GA2D_, LogOverlapFactor, compact, herm);
        if (transposed)
          G[iw / 2].sliced(0, GAdims.first * GAdims.second) = GA1D_;
        else
          G({0, GAdims.first * GAdims.second}, iw / 2) = GA1D_;
      }
      else
      {
        ovlp2[iw] =
            SDetOp.MixedDensityMatrix(RefB, *wset[iw / 2].SlaterMatrix(Beta), GB2D_, LogOverlapFactor, compact, herm);
        if (transposed)
          G[iw / 2].sliced(GAdims.first * GAdims.second, G.size(1)) = GB1D_;
        else
          G({GAdims.first * GAdims.second, G.size(0)}, iw / 2) = GB1D_;
      }
    }
    if (TG.TG_local().size() > 1)
      TG.TG_local().all_reduce_in_place_n(to_address(ovlp2.origin()), 2 * nw, std::plus<>());
    if (TG.TG_local().root())
      for (int iw = 0; iw < nw; ++iw)
        Ov[iw] = ovlp2[2 * iw] * ovlp2[2 * iw + 1];
  }
  TG.local_barrier();
}

template<class MatA, class MatB, class MatG, class TVec>
void PHMSD::DensityMatrix_shared(std::vector<MatA>& Left,
                                 std::vector<MatB>& Right,
                                 std::vector<MatG>& G,
                                 TVec&& Ov,
                                 double LogOverlapFactor,
                                 bool herm,
                                 bool compact)
{
  const int nw = Left.size();
  assert(Right.size() == nw);
  assert(G.size() == nw);
  assert(Ov.size() >= nw);
  // to force synchronization before modifying structures in SHM
  TG.local_barrier();
  for (int iw = 0; iw < nw; ++iw)
  {
    if (iw % TG.TG_local().size() != TG.TG_local().rank())
      continue;
    Ov[iw] = SDetOp.MixedDensityMatrix(*Left[iw], *Right[iw], *G[iw], LogOverlapFactor, compact, herm);
  }
  TG.local_barrier();
}

/*
 * TODO: Implement.
 */
template<class WlkSet, class MatG, class CVec1, class CVec2, class Mat1, class Mat2>
void PHMSD::WalkerAveragedDensityMatrix(const WlkSet& wset,
                                        CVec1& wgt,
                                        MatG& G,
                                        CVec2& denom,
                                        Mat1&& Ovlp,
                                        Mat2&& DMsum,
                                        bool free_projection,
                                        boost::multi::array_ref<ComplexType, 3>* Refs,
                                        boost::multi::array<ComplexType, 2>* detR)
{
  APP_ABORT(" Error: Back Propagation not implemented for PHMSD. \n");
}
/*
 * Calculates the overlaps of all walkers in the set. Returns values in arrays.
 * Ov is assumed to be local to the core.
 */
template<class WlkSet, class TVec>
void PHMSD::Overlap(const WlkSet& wset, TVec&& Ov)
{
  const int nw = wset.size();
  assert(Ov.size() >= nw);
  std::fill(Ov.begin(), Ov.begin() + nw, 0);
  auto refc = abij.reference_configuration();
  double LogOverlapFactor(wset.getLogOverlapFactor());
  if (walker_type != COLLINEAR)
  {
    APP_ABORT(" Error: Finish implementation of PHMSD::Overlap for CLOSED and NONCOLLINEAR. \n");
    const int ntasks_percore = nw / TG.getNCoresPerTG();
    const int ntasks_total_serial = ntasks_percore * TG.getNCoresPerTG();
    const int nextra = nw - ntasks_total_serial;

    // each processor does ntasks_percore_serial overlaps serially
    const int w0 = TG.getLocalTGRank() * ntasks_percore;
    const int wN = (TG.getLocalTGRank() + 1) * ntasks_percore;

    // task_w_d = wlk_w*ndet + d
    ComplexType ov0;
    for (int iw = w0; iw < wN; ++iw)
    {
      ov0 = SDetOp.OverlapForWoodbury(OrbMats[0], *wset[iw].SlaterMatrix(Alpha), LogOverlapFactor, refc, local_QQ0inv0);
      local_ov[0][0] = 1.0;
      calculate_overlaps(0, 1, 0, abij, local_QQ0inv0, Qwork, local_ov[0]);
      for (auto it = abij.configurations_begin(); it < abij.configurations_end(); ++it)
      {
        Ov[iw] += ma::conj(std::get<2>(*it)) * ov0 * local_ov[0][std::get<0>(*it)] *
            ((walker_type == CLOSED) ? (ov0 * local_ov[0][std::get<0>(*it)]) : (ComplexType(1.0, 0.0)));
      }
    }
    if (nextra > 0)
    {
      /*
      // check if new communicator is necessary
      if( last_number_extra_tasks != nextra ) {
        last_number_extra_tasks = nextra;
        for(int n=0; n<nextra; n++) {
          int n0,n1;
          std::tie(n0,n1) = FairDivideBoundary(n,TG.getNCoresPerTG(),nextra);
          if(TG.getLocalTGRank()>=n0 && TG.getLocalTGRank()<n1) {
            last_task_index = n;
            break;
          }
        }
        // first setup
        local_group_comm = shared_communicator(TG.TG_local().split(last_task_index,
                                                                   TG.TG_local().size()));
        {
          shmCMatrix _ov_({2,maxn_unique_confg},
                          shared_allocator<ComplexType>{local_group_comm});
          unique_overlaps.swap(_ov_);
        }
        {
          shmC3Tensor _qq0inv_({1,maxnactive,size_t(NAEA)},
                               shared_allocator<ComplexType>{local_group_comm});
          QQ0inv.swap(_qq0inv_);
        }
      }
      if(last_task_index < 0 || last_task_index > nextra)
        APP_ABORT("Error: Problems in PHMSD::Overlap(WSet,Ov)");
      {
        if(local_group_comm.rank()==0) unique_overlaps[0][0] = 1.0;
        local_group_comm.barrier();
        int iw = (last_task_index+ntasks_total_serial);
        ComplexType ov;
        SDetOp.OverlapForWoodbury(OrbMats[0],*wset[iw].SlaterMatrix(Alpha),std::addressof(ov),refc,QQ0inv0,local_group_comm);
        calculate_overlaps(local_group_comm.rank(),local_group_comm.size(),0,abij,QQ0inv0,Qwork,unique_overlaps[0]);
        local_group_comm.barrier();
        int cnt=0;
        for(auto it=abij.configurations_begin(); it<abij.configurations_end(); ++it, cnt++)
          if(cnt%local_group_comm.size()==local_group_comm.rank())
            Ov[iw] += ma::conj(std::get<2>(*it))*ov*
                unique_overlaps[0][std::get<0>(*it)]*
                ((walker_type==CLOSED)?(ov*unique_overlaps[0][std::get<0>(*it)]):(ComplexType(1.0,0.0)));
      }
      */
    }
  }
  else
  {
    const int ntasks_percore = nw / TG.getNCoresPerTG();
    const int ntasks_total_serial = ntasks_percore * TG.getNCoresPerTG();
    const int nextra = nw - ntasks_total_serial;
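    // NOTE on the split used here (and in MixedDensityMatrix above): the
    // first ntasks_percore*ncores walkers are assigned one-per-core and done
    // serially; the nextra leftover walkers (at most ncores-1) are each
    // handled cooperatively by a sub-communicator obtained by splitting
    // TG.TG_local(), so no core idles on the remainder.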
    // each processor does ntasks_percore_serial overlaps serially
    const int w0 = TG.getLocalTGRank() * ntasks_percore;
    const int wN = (TG.getLocalTGRank() + 1) * ntasks_percore;

    // task_w_d = wlk_w*ndet + d
    local_ov[0][0] = 1.0;
    local_ov[1][0] = 1.0;
    for (int iw = w0; iw < wN; ++iw)
    {
      ComplexType ov0 =
          SDetOp.OverlapForWoodbury(OrbMats[0], *wset[iw].SlaterMatrix(Alpha), LogOverlapFactor, refc, local_QQ0inv0);
      calculate_overlaps(0, 1, 0, abij, local_QQ0inv0, Qwork, local_ov[0]);
      ov0 *= SDetOp.OverlapForWoodbury(OrbMats.back(), *wset[iw].SlaterMatrix(Beta), LogOverlapFactor, refc + NAEA,
                                       local_QQ0inv1);
      calculate_overlaps(0, 1, 1, abij, local_QQ0inv1, Qwork, local_ov[1]);
      for (auto it = abij.configurations_begin(); it < abij.configurations_end(); ++it)
      {
        Ov[iw] += ma::conj(std::get<2>(*it)) * ov0 * local_ov[0][std::get<0>(*it)] * local_ov[1][std::get<1>(*it)];
      }
    }

    // all remaining overlaps are performed in parallel with blocks of cores
    // partition processors into nextra groups
    if (nextra > 0)
    {
      // check if new communicator is necessary
      if (last_number_extra_tasks != nextra)
      {
        last_number_extra_tasks = nextra;
        for (int n = 0; n < nextra; n++)
        {
          int n0, n1;
          std::tie(n0, n1) = FairDivideBoundary(n, TG.getNCoresPerTG(), nextra);
          if (TG.getLocalTGRank() >= n0 && TG.getLocalTGRank() < n1)
          {
            last_task_index = n;
            break;
          }
        }
        // reset
        local_group_comm = shared_communicator(TG.TG_local().split(last_task_index, TG.TG_local().size()));
        auto GAdims0 = dm_dims_ref(false, Alpha);
        auto GBdims0 = dm_dims_ref(false, Beta);
        {
          shmCMatrix unique_overlaps_({2, maxn_unique_confg}, shared_allocator<ComplexType>{local_group_comm});
          unique_overlaps.swap(unique_overlaps_);
        }
        {
          shmCMatrix QQ0inv0_({OrbMats[0].size(0), NAEA}, shared_allocator<ComplexType>{local_group_comm});
          QQ0inv0.swap(QQ0inv0_);
        }
        {
          shmCMatrix QQ0inv1_({OrbMats.back().size(0), NAEB}, shared_allocator<ComplexType>{local_group_comm});
          QQ0inv1.swap(QQ0inv1_);
        }
        // don't need them here, but allocate anyway to avoid dimension check every time
        {
          shmCMatrix GA2D0_shm_({GAdims0.first, GAdims0.second}, shared_allocator<ComplexType>{local_group_comm});
          GA2D0_shm.swap(GA2D0_shm_);
        }
        {
          shmCMatrix GB2D0_shm_({GBdims0.first, GBdims0.second}, shared_allocator<ComplexType>{local_group_comm});
          GB2D0_shm.swap(GB2D0_shm_);
        }
        local_group_comm.barrier();
      }
      if (last_task_index < 0 || last_task_index > nextra)
        APP_ABORT("Error: Problems in PHMSD::Overlap(WSet,Ov)");
      {
        if (local_group_comm.rank() == 0)
          unique_overlaps[0][0] = 1.0;
        if (local_group_comm.rank() == 0)
          unique_overlaps[1][0] = 1.0;
        local_group_comm.barrier();
        int iw = (last_task_index + ntasks_total_serial);
        ComplexType ov = SDetOp.OverlapForWoodbury(OrbMats[0], *wset[iw].SlaterMatrix(Alpha), LogOverlapFactor, refc,
                                                   QQ0inv0, local_group_comm);
        calculate_overlaps(local_group_comm.rank(), local_group_comm.size(), 0, abij, QQ0inv0, Qwork,
                           unique_overlaps[0]);
        ov *= SDetOp.OverlapForWoodbury(OrbMats.back(), *wset[iw].SlaterMatrix(Beta), LogOverlapFactor, refc + NAEA,
                                        QQ0inv1, local_group_comm);
        calculate_overlaps(local_group_comm.rank(), local_group_comm.size(), 1, abij, QQ0inv1, Qwork,
                           unique_overlaps[1]);
        local_group_comm.barrier();
        size_t cnt = 0;
        for (auto it = abij.configurations_begin(); it < abij.configurations_end(); ++it, cnt++)
        {
          if (cnt % local_group_comm.size() == local_group_comm.rank())
            Ov[iw] += ma::conj(std::get<2>(*it)) * ov * unique_overlaps[0][std::get<0>(*it)] *
                unique_overlaps[1][std::get<1>(*it)];
        }
      }
    }
  }
  TG.TG_local().all_reduce_in_place_n(to_address(Ov.origin()), nw, std::plus<>());
}

/*
 * Orthogonalizes the Slater matrices of all walkers in the set.
 * Options:
 *  - bool importanceSampling(default=true): use algorithm appropriate for importance sampling.
 *    This means that the determinant of the R matrix in the QR decomposition is ignored.
 *    If false, add the determinant of R to the weight of the walker.
 */
template<class WlkSet>
void PHMSD::Orthogonalize(WlkSet& wset, bool impSamp)
{
  ComplexType detR(1.0, 0.0);
  double LogOverlapFactor(wset.getLogOverlapFactor());
  if (walker_type != COLLINEAR)
  {
    int cnt = 0;
    for (typename WlkSet::iterator it = wset.begin(); it != wset.end(); ++it)
    {
      if ((cnt++) % TG.getNCoresPerTG() == TG.getLocalTGRank())
      {
        if (excitedState && numExcitations.first > 0)
          OrthogonalizeExcited(*it->SlaterMatrix(Alpha), Alpha, LogOverlapFactor);
        else
          detR = SDetOp.Orthogonalize(*it->SlaterMatrix(Alpha), LogOverlapFactor);
        if (!impSamp)
        {
          if (walker_type == CLOSED)
            *it->weight() *= (detR * detR);
          else
            *it->weight() *= detR;
        }
      }
    }
  }
  else
  {
    int cnt = 0;
    for (typename WlkSet::iterator it = wset.begin(); it != wset.end(); ++it)
    {
      if ((2 * (cnt++)) % TG.getNCoresPerTG() == TG.getLocalTGRank())
      {
        if (excitedState && numExcitations.first > 0)
          OrthogonalizeExcited(*it->SlaterMatrix(Alpha), Alpha, LogOverlapFactor);
        else
          detR = SDetOp.Orthogonalize(*it->SlaterMatrix(Alpha), LogOverlapFactor);
        if (!impSamp)
        {
          std::lock_guard<shared_mutex> guard(*mutex);
          *it->weight() *= detR;
        }
      }
      if ((2 * (cnt++) + 1) % TG.getNCoresPerTG() == TG.getLocalTGRank())
      {
        if (excitedState && numExcitations.second > 0)
          OrthogonalizeExcited(*it->SlaterMatrix(Beta), Beta, LogOverlapFactor);
        else
          detR = SDetOp.Orthogonalize(*it->SlaterMatrix(Beta), LogOverlapFactor);
        if (!impSamp)
        {
          std::lock_guard<shared_mutex> guard(*mutex);
          *it->weight() *= detR;
        }
      }
    }
  }
  TG.local_barrier();
  // recalculate overlaps
  Overlap(wset);
}
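/*
 * NOTE on Orthogonalize above: when importance sampling is disabled, the
 * determinant of R from the QR factorization is folded back into the walker
 * weight (squared for CLOSED, where alpha and beta share one Slater matrix).
 * In the COLLINEAR branch the alpha and beta factors of the same walker may
 * be applied by different cores, so the weight update is guarded by a mutex.
 */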
/*
 * Orthogonalize extended Slater Matrix for excited states calculation.
 * Ret
 */
template<class Mat>
void PHMSD::OrthogonalizeExcited(Mat&& A, SpinTypes spin, double LogOverlapFactor)
{
  if (walker_type == NONCOLLINEAR)
    APP_ABORT(" Error: OrthogonalizeExcited not implemented with NONCOLLINEAR.\n");
  if (spin == Alpha)
  {
    if (extendedMatAlpha.size(0) != NMO || extendedMatAlpha.size(1) != maxOccupExtendedMat.first)
      extendedMatAlpha.reextent({NMO, maxOccupExtendedMat.first});
    extendedMatAlpha(extendedMatAlpha.extension(0), {0, NAEA}) = A;
    extendedMatAlpha(extendedMatAlpha.extension(0), {NAEA + 1, maxOccupExtendedMat.first}) =
        excitedOrbMat[0](excitedOrbMat.extension(1), {NAEA + 1, maxOccupExtendedMat.first});
    // move i->a, copy trial orb i
    for (auto& i : excitations)
      if (i.first < NMO && i.second < NMO)
      {
        extendedMatAlpha(extendedMatAlpha.extension(0), i.second) =
            extendedMatAlpha(extendedMatAlpha.extension(0), i.first);
        extendedMatAlpha(extendedMatAlpha.extension(0), i.first) =
            excitedOrbMat[0](excitedOrbMat.extension(1), i.first);
      }
    ComplexType det = SDetOp.Orthogonalize(extendedMatAlpha, LogOverlapFactor);
    A(A.extension(0), {0, NAEA}) = extendedMatAlpha(extendedMatAlpha.extension(0), {0, NAEA});
    for (auto& i : excitations)
      if (i.first < NMO && i.second < NMO)
        A(A.extension(0), i.first) = extendedMatAlpha(extendedMatAlpha.extension(0), i.second);
  }
  else
  {
    if (extendedMatBeta.size(0) != NMO || extendedMatBeta.size(1) != maxOccupExtendedMat.second)
      extendedMatBeta.reextent({NMO, maxOccupExtendedMat.second});
    extendedMatBeta(extendedMatBeta.extension(0), {0, NAEB}) = A;
    extendedMatBeta(extendedMatBeta.extension(0), {NAEB + 1, maxOccupExtendedMat.second}) =
        excitedOrbMat[1](excitedOrbMat.extension(1), {NAEB + 1, maxOccupExtendedMat.second});
    // move i->a, copy trial orb i
    for (auto& i : excitations)
      if (i.first >= NMO && i.second >= NMO)
      {
        extendedMatBeta(extendedMatBeta.extension(0), i.second) =
            extendedMatBeta(extendedMatBeta.extension(0), i.first);
        extendedMatBeta(extendedMatBeta.extension(0), i.first) = excitedOrbMat[1](excitedOrbMat.extension(1), i.first);
      }
    ComplexType detR = SDetOp.Orthogonalize(extendedMatBeta, LogOverlapFactor);
    A(A.extension(0), {0, NAEB}) = extendedMatBeta(extendedMatBeta.extension(0), {0, NAEB});
    for (auto& i : excitations)
      if (i.first >= NMO && i.second >= NMO)
        A(A.extension(0), i.first) = extendedMatBeta(extendedMatBeta.extension(0), i.second);
  }
}

/*
 * Calculate mean field expectation value of Cholesky potentials.
 * Can put G in shared memory with proper synchronization to avoid duplicated memory
 * at the expense of synchronization overhead.
 */
template<class Vec>
void PHMSD::vMF(Vec&& v)
{
  if (walker_type == NONCOLLINEAR)
    APP_ABORT(" Error: Finish implementation of PHMSD::vMF for NONCOLLINEAR. \n");

  using ma::conj;
  using ma::T;
  using std::get;
  using std::norm;
  assert(v.num_elements() == local_number_of_cholesky_vectors());
  std::fill_n(v.origin(), v.num_elements(), ComplexType(0));
  auto Gsize = size_t(dm_size(false));
  if (localGbuff.size() < Gsize)
    localGbuff.reextent(iextensions<1u>{Gsize});

  //shmCMatrix ovlps({2,maxn_unique_confg},shared_allocator<ComplexType>{TG.Node()});
  //boost::multi::array<ComplexType,2> ovlps({2,maxn_unique_confg});
  boost::multi::array<ComplexType, 2> ovlps({2, maxn_unique_confg});
  //if(TG.Node().root())
  std::fill_n(to_address(ovlps.origin()), 2 * maxn_unique_confg, ComplexType(0.0));

  auto refc = abij.reference_configuration();
  auto confgs = abij.configurations_begin();
  std::vector<int> exct(2 * NAEA);
  std::vector<int> Iwork(2 * NAEA);
  std::vector<int> confg(NAEA);
  std::vector<int> confgB(NAEA);
  TG.Node().barrier();

  // 1. Overlaps
  for (int spin = 0, nc = 0; spin < 2; ++spin)
  {
    int orb_spin_indx = (OrbMats.size() == 2) ? spin : 0;
    int wlk_spin_indx = (walker_type == CLOSED) ? 0 : spin;
    confg.resize((spin == 0) ? NAEA : NAEB);
    auto Gdims = dm_dims(false, SpinTypes(spin));
    auto Gdims_ref = dm_dims_ref(false, SpinTypes(spin));
    boost::multi::array<ComplexType, 2> SM_({Gdims_ref.second, Gdims_ref.first});
    for (int nd = 0; nd < det_couplings[spin].size(); ++nd, ++nc)
    {
      if (nc % TG.Global().size() == TG.Global().rank())
      {
        abij.get_configuration(spin, nd, confg);
        ma::Matrix2MA('H', OrbMats[orb_spin_indx], SM_, confg);
        ovlps[spin][nd] = SDetOp.Overlap(SM_, 0.0);
      }
    }
  }
  TG.Node().barrier();
  //if(TG.Node().root())
  //  TG.Cores().all_reduce_in_place_n(to_address(ovlps.origin()),ovlps.num_elements(),std::plus<>());
  TG.Global().all_reduce_in_place_n(to_address(ovlps.origin()), ovlps.num_elements(), std::plus<>());
  TG.Node().barrier();

  ComplexType ov(0.0);
  for (auto it = abij.configurations_begin(); it < abij.configurations_end(); ++it)
    ov += norm(std::get<2>(*it)) * ovlps[0][std::get<0>(*it)] * ovlps[1][std::get<1>(*it)];

  // 2. Diagonal and off-diagonal components
  for (int spin = 0; spin < 2; ++spin)
  {
    int other_spin = 1 - spin;
    int orb_spin_indx = (OrbMats.size() == 2) ? spin : 0;
    int wlk_spin_indx = (walker_type == CLOSED) ? 0 : spin;
    confg.resize((spin == 0) ? NAEA : NAEB);
    auto Gdims = dm_dims(false, SpinTypes(spin));
    auto Gdims_ref = dm_dims_ref(false, SpinTypes(spin));
    boost::multi::array_ref<ComplexType, 2> G2D_(localGbuff.origin(), {Gdims_ref.first, Gdims_ref.second});
    boost::multi::array_ref<ComplexType, 1> G1D_(G2D_.origin(), iextensions<1u>{G2D_.num_elements()});
    boost::multi::array<ComplexType, 2> SM_({Gdims_ref.second, Gdims_ref.first});
    // store mean field G
    boost::multi::array<ComplexType, 2> G({Gdims.first, Gdims.second});
    std::fill_n(G.origin(), G.num_elements(), ComplexType(0.0));

    // diagonal contribution
    for (int nd = 0; nd < det_couplings[spin].size(); ++nd)
    {
      if (nd % TG.Global().size() == TG.Global().rank())
      {
        abij.get_configuration(spin, nd, confg);
        ma::Matrix2MA('H', OrbMats[orb_spin_indx], SM_, confg);
        ComplexType ov_ = SDetOp.MixedDensityMatrix(SM_, G2D_, 0.0, true);
        ComplexType wgt(0.0);
        auto it = to_address(det_couplings[spin].values()) + (*det_couplings[spin].pointers_begin(nd));
        auto ite = to_address(det_couplings[spin].values()) + (*det_couplings[spin].pointers_end(nd));
        for (; it < ite; ++it)
          wgt += ovlps[other_spin][get_index(*(confgs + (*it)), other_spin)] * norm(get<2>(*(confgs + (*it))));
        wgt *= ovlps[spin][nd];
        for (int k = 0; k < confg.size(); ++k)
          ma::axpy(wgt, G2D_[k], G[confg[k]]);
      }
    }
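    // NOTE: the off-diagonal block below pairs unique configurations of this
    // spin that share an opposite-spin determinant and differ by exactly one
    // orbital (get_excitation_number returning np == 1); higher excitations
    // give no one-body contribution, and each surviving pair adds
    // conj(c1)*c2-weighted orbital products to the mean-field G.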
    // off-diagonal contribution
    boost::multi::array<ComplexType, 2> orbs({2, Gdims.second});
    confgB.resize((spin == 0) ? NAEA : NAEB);
    ComplexType dummy(0.0);
    for (int nd = 0; nd < det_couplings[other_spin].size(); ++nd)
    {
      if (nd % TG.Global().size() == TG.Global().rank())
      {
        auto it1 = to_address(det_couplings[other_spin].values()) + (*det_couplings[other_spin].pointers_begin(nd));
        auto ite = to_address(det_couplings[other_spin].values()) + (*det_couplings[other_spin].pointers_end(nd));
        for (; it1 < ite; ++it1)
        {
          auto ci1 = get<2>(*(confgs + (*it1)));
          size_t cf1 = get_index(*(confgs + (*it1)), spin);
          abij.get_configuration(spin, cf1, confg);
          sort(confg.begin(), confg.end());
          for (auto it2 = it1 + 1; it2 < ite; ++it2)
          {
            size_t cf2 = get_index(*(confgs + (*it2)), spin);
            abij.get_configuration(spin, cf2, confgB);
            sort(confgB.begin(), confgB.end());
            exct.clear();
            int np = get_excitation_number(true, confg, confgB, exct, dummy, Iwork);
            if (np == 1)
            {
              ComplexType wgt = ma::conj(ci1) * get<2>(*(confgs + (*it2)));
              /*
               * exct: [0]: location of orbital being excited, [1]: excited orbital
               * confg[exct[0]]: occupied orbital being excited
               * WARNING!!! Assumes orthogonal states, needs overlap factor!!!
               * Either calculate it (expensive) or demand orthogonality!!!
               */
              exct[0] = confg[exct[0]];
              ma::Matrix2MA('Z', OrbMats[orb_spin_indx], orbs, exct);
              ma::axpy(wgt, orbs[0], G[exct[1]]);
              ma::axpy(ma::conj(wgt), orbs[1], G[exct[0]]);
            }
          }
        }
      }
    }
    boost::multi::array_ref<ComplexType, 1> G1D(G.origin(), iextensions<1u>{G.num_elements()});
    TG.Global().all_reduce_in_place_n(to_address(G1D.origin()), G1D.num_elements(), std::plus<>());
    ma::scal(ComplexType(1.0 / ov), G1D);
    HamOp.vbias(G1D, std::forward<Vec>(v), 0.5, 1.0);
  }
  TG.Node().barrier();

  // since v is not in shared memory, we need to reduce
  TG.TG_local().all_reduce_in_place_n(to_address(v.origin()), v.num_elements(), std::plus<>());
  // NOTE: since SpvnT is a truncated structure the complex part of vMF,
  // which should be exactly zero, suffers from truncation errors.
  // Set it to zero.
  ma::zero_complex_part(v);
  //  for(int i=0; i<v.num_elements(); i++)
  //    v[i] = ComplexType(real(v[i]),0.0);
}

} // namespace afqmc
} // namespace qmcplusplus