1 ////////////////////////////////////////////////////////////////////// 2 // This file is distributed under the University of Illinois/NCSA Open Source 3 // License. See LICENSE file in top directory for details. 4 // 5 // Copyright (c) 2016 Jeongnim Kim and QMCPACK developers. 6 // 7 // File developed by: 8 // Miguel A. Morales, moralessilva2@llnl.gov 9 // Lawrence Livermore National Laboratory 10 // 11 // File created by: 12 // Miguel A. Morales, moralessilva2@llnl.gov 13 // Lawrence Livermore National Laboratory 14 //////////////////////////////////////////////////////////////////////////////// 15 16 #ifndef QMCPLUSPLUS_AFQMC_REALSPACE_CORRELATORS_HPP 17 #define QMCPLUSPLUS_AFQMC_REALSPACE_CORRELATORS_HPP 18 19 #include "AFQMC/config.h" 20 #include <vector> 21 #include <string> 22 #include <iostream> 23 24 #include "hdf/hdf_multi.h" 25 #include "hdf/hdf_archive.h" 26 #include "OhmmsData/libxmldefs.h" 27 #include "Utilities/Timer.h" 28 29 #include "AFQMC/Walkers/WalkerSet.hpp" 30 #include "AFQMC/Numerics/detail/utilities.hpp" 31 #include "AFQMC/Numerics/ma_operations.hpp" 32 #include "AFQMC/Numerics/batched_operations.hpp" 33 #include "AFQMC/Memory/buffer_managers.h" 34 35 namespace qmcplusplus 36 { 37 namespace afqmc 38 { 39 /* 40 * Observable class that calculates the walker averaged on-top pair density 41 * Alloc defines the allocator type used to store the orbital and temporary tensors. 42 * In a device compilation, this would need out-of-card gemm available. 43 */ 44 class realspace_correlators : public AFQMCInfo 45 { 46 // allocators 47 using Allocator = localTG_allocator<ComplexType>; 48 49 // type defs 50 using pointer = device_ptr<ComplexType>; 51 52 using CVector_ref = boost::multi::array_ref<ComplexType, 1, pointer>; 53 using CMatrix_ref = boost::multi::array_ref<ComplexType, 2, pointer>; 54 using CTensor_ref = boost::multi::array_ref<ComplexType, 3, pointer>; 55 using C4Tensor_ref = boost::multi::array_ref<ComplexType, 4, pointer>; 56 using CVector = boost::multi::array<ComplexType, 1, Allocator>; 57 using CMatrix = boost::multi::array<ComplexType, 2, Allocator>; 58 using stdCVector = boost::multi::array<ComplexType, 1>; 59 using stdCMatrix = boost::multi::array<ComplexType, 2>; 60 using stdIVector = boost::multi::array<int, 1>; 61 using stdCVector_ref = boost::multi::array_ref<ComplexType, 1>; 62 using stdCMatrix_ref = boost::multi::array_ref<ComplexType, 2>; 63 using mpi3CVector = boost::multi::array<ComplexType, 1, shared_allocator<ComplexType>>; 64 using mpi3CMatrix = boost::multi::array<ComplexType, 2, shared_allocator<ComplexType>>; 65 using mpi3CTensor = boost::multi::array<ComplexType, 3, shared_allocator<ComplexType>>; 66 using mpi3C4Tensor = boost::multi::array<ComplexType, 4, shared_allocator<ComplexType>>; 67 68 using shm_stack_alloc_type = LocalTGBufferManager::template allocator_t<ComplexType>; 69 using StaticMatrix = boost::multi::static_array<ComplexType, 2, shm_stack_alloc_type>; 70 71 public: realspace_correlators(afqmc::TaskGroup_ & tg_,AFQMCInfo & info,xmlNodePtr cur,WALKER_TYPES wlk,int nave_=1,int bsize=1)72 realspace_correlators(afqmc::TaskGroup_& tg_, 73 AFQMCInfo& info, 74 xmlNodePtr cur, 75 WALKER_TYPES wlk, 76 int nave_ = 1, 77 int bsize = 1) 78 : AFQMCInfo(info), 79 alloc(make_localTG_allocator<ComplexType>(tg_)), 80 block_size(bsize), 81 nave(nave_), 82 counter(0), 83 TG(tg_), 84 walker_type(wlk), 85 dm_size(0), 86 writer(false), 87 Orbitals({0, 0}, alloc), 88 DMAverage({0, 0, 0}, shared_allocator<ComplexType>{TG.TG_local()}), 89 DMWork({0, 0, 0}, shared_allocator<ComplexType>{TG.TG_local()}), 90 denom(iextensions<1u>{0}, shared_allocator<ComplexType>{TG.TG_local()}), 91 Gr_host({0, 0, 0, 0}, shared_allocator<ComplexType>{TG.TG_local()}), 92 Orrp({0, 0}, shared_allocator<ComplexType>{TG.TG_local()}) 93 { 94 app_log() << " -- Adding Back Propagated real-space off diagonal 2RDM (realspace_correlators). -- \n "; 95 std::string orb_file(""); 96 std::string str("false"); 97 98 if (cur != NULL) 99 { 100 ParameterSet m_param; 101 m_param.add(orb_file, "orbitals"); 102 m_param.put(cur); 103 } 104 105 if (not file_exists(orb_file)) 106 { 107 app_error() << " Error: orbitals file does not exist: " << orb_file << std::endl; 108 APP_ABORT(""); 109 } 110 111 stdIVector norbs(iextensions<1u>{0}); 112 dm_size = 0; 113 int npoints = 0; 114 115 // read orbitals 116 hdf_archive dump; 117 if (TG.Node().root()) 118 { 119 if (!dump.open(orb_file, H5F_ACC_RDONLY)) 120 { 121 app_error() << " Error opening orbitals file for realspace_correlators estimator. \n"; 122 APP_ABORT(""); 123 } 124 if (!dump.push("OrbsR", false)) 125 { 126 app_error() << " Error in realspace_correlators: Group OrbsR not found." << std::endl; 127 APP_ABORT(""); 128 } 129 // read one orbital to check size and later corroborate all orbitals have same size 130 stdCVector orb(iextensions<1u>{1}); 131 if (!dump.readEntry(orb, "kp0_b0")) 132 { 133 app_error() << " Error in realspace_correlators: Problems reading orbital: 0 0" << std::endl; 134 APP_ABORT(""); 135 } 136 npoints = orb.size(0); 137 if (npoints < 1) 138 { 139 app_error() << " Error in realspace_correlators: npoints < 1. " << std::endl; 140 APP_ABORT(""); 141 } 142 TG.Node().broadcast_n(&npoints, 1, 0); 143 if (!dump.readEntry(norbs, "number_of_orbitals")) 144 { 145 app_error() << " Error in realspace_correlators: Problems reading number_of_orbitals. " << std::endl; 146 APP_ABORT(""); 147 } 148 int M = 0, nk = norbs.size(); 149 for (int i = 0; i < nk; i++) 150 M += norbs[i]; 151 if (M != NMO) 152 { 153 app_error() << " Error in realspace_correlators: Inconsistent number of orbitals in file: " << M 154 << " expected:" << NMO << std::endl; 155 APP_ABORT(""); 156 } 157 TG.Node().broadcast_n(&npoints, 1, 0); 158 Orbitals = CMatrix({NMO, npoints}, alloc); 159 // host copy to calculate Orrp 160 stdCMatrix host_orb({NMO, npoints}); 161 Orrp = mpi3CMatrix({npoints, npoints}, shared_allocator<ComplexType>{TG.TG_local()}); 162 for (int k = 0, kn = 0; k < nk; k++) 163 { 164 for (int i = 0; i < norbs[k]; i++, kn++) 165 { 166 if (!dump.readEntry(orb, "kp" + std::to_string(k) + "_b" + std::to_string(i))) 167 { 168 app_error() << " Error in realspace_correlators: Problems reading orbital: " << k << " " << i << std::endl; 169 APP_ABORT(""); 170 } 171 if (orb.size(0) != npoints) 172 { 173 app_error() << " Error in realspace_correlators: Inconsistent orbital size: " << k << " " << i << std::endl; 174 APP_ABORT(""); 175 } 176 using std::copy_n; 177 copy_n(orb.origin(), npoints, ma::pointer_dispatch(Orbitals[kn].origin())); 178 copy_n(orb.origin(), npoints, host_orb[kn].origin()); 179 } 180 } 181 dump.pop(); 182 dump.close(); 183 184 // Or(r,r') = sum_j conj(psi(j,r)) * psi(j,r') 185 ma::product(ma::H(host_orb), host_orb, Orrp); 186 187 app_log() << " Number of grid points: " << npoints << std::endl; 188 } 189 else 190 { 191 TG.Node().broadcast_n(&npoints, 1, 0); 192 Orbitals = CMatrix({NMO, npoints}, alloc); 193 Orrp = mpi3CMatrix({npoints, npoints}, shared_allocator<ComplexType>{TG.TG_local()}); 194 } 195 dm_size = npoints * npoints; 196 TG.Node().barrier(); 197 198 using std::fill_n; 199 writer = (TG.getGlobalRank() == 0); 200 201 if (writer) 202 { 203 type_id.reserve(3); 204 type_id.emplace_back("CC"); 205 type_id.emplace_back("SS"); 206 type_id.emplace_back("CS"); 207 } 208 209 DMAverage = mpi3CTensor({nave, 3, dm_size}, shared_allocator<ComplexType>{TG.TG_local()}); 210 fill_n(DMAverage.origin(), DMAverage.num_elements(), ComplexType(0.0, 0.0)); 211 } 212 213 template<class MatG, class MatG_host, class HostCVec1, class HostCVec2, class HostCVec3> accumulate_reference(int iav,int iref,MatG && G,MatG_host && G_host,HostCVec1 && wgt,HostCVec2 && Xw,HostCVec3 && ovlp,bool impsamp)214 void accumulate_reference(int iav, 215 int iref, 216 MatG&& G, 217 MatG_host&& G_host, 218 HostCVec1&& wgt, 219 HostCVec2&& Xw, 220 HostCVec3&& ovlp, 221 bool impsamp) 222 { 223 static_assert(std::decay<MatG>::type::dimensionality == 4, "Wrong dimensionality"); 224 static_assert(std::decay<MatG_host>::type::dimensionality == 4, "Wrong dimensionality"); 225 using ma::gemmBatched; 226 using std::copy_n; 227 using std::fill_n; 228 // assumes G[nwalk][spin][M][M] 229 int nw(G.size(0)); 230 int npts(Orbitals.size(1)); 231 assert(G.size(0) == wgt.size(0)); 232 assert(wgt.size(0) == nw); 233 assert(Xw.size(0) == nw); 234 assert(ovlp.size(0) >= nw); 235 assert(G.num_elements() == G_host.num_elements()); 236 assert(G.extensions() == G_host.extensions()); 237 238 int nsp; 239 if (walker_type == CLOSED) 240 nsp = 1; 241 else 242 nsp = 2; 243 244 // check structure dimensions 245 if (iref == 0) 246 { 247 if (denom.size(0) != nw) 248 { 249 denom = mpi3CVector(iextensions<1u>{nw}, shared_allocator<ComplexType>{TG.TG_local()}); 250 } 251 if (DMWork.size(0) != nw || DMWork.size(1) != 3 || DMWork.size(2) != dm_size) 252 { 253 DMWork = mpi3CTensor({nw, 3, dm_size}, shared_allocator<ComplexType>{TG.TG_local()}); 254 } 255 if (Gr_host.size(0) != nw || Gr_host.size(1) != nsp || Gr_host.size(2) != npts || Gr_host.size(3) != npts) 256 { 257 Gr_host = mpi3C4Tensor({nw, nsp, npts, npts}, shared_allocator<ComplexType>{TG.TG_local()}); 258 } 259 fill_n(denom.origin(), denom.num_elements(), ComplexType(0.0, 0.0)); 260 fill_n(DMWork.origin(), DMWork.num_elements(), ComplexType(0.0, 0.0)); 261 } 262 else 263 { 264 if (denom.size(0) != nw || DMWork.size(0) != nw || DMWork.size(1) != 3 || DMWork.size(2) != dm_size || 265 Gr_host.size(0) != nw || Gr_host.size(1) != nsp || Gr_host.size(2) != npts || Gr_host.size(3) != npts || 266 DMAverage.size(0) != nave || DMAverage.size(1) != 3 || DMAverage.size(2) != dm_size) 267 APP_ABORT(" Error: Invalid state in accumulate_reference. \n\n\n"); 268 } 269 270 271 // calculate green functions in real space and send to host for processing/accumulation 272 // if memory becomes a problem, then batch over walkers 273 { 274 LocalTGBufferManager buffer_manager; 275 StaticMatrix T({nw * nsp * NMO, npts}, buffer_manager.get_generator().template get_allocator<ComplexType>()); 276 StaticMatrix Gr({nsp * nw, npts * npts}, buffer_manager.get_generator().template get_allocator<ComplexType>()); 277 CTensor_ref Gr3D(make_device_ptr(Gr.origin()), {nw, nsp, npts * npts}); 278 CTensor_ref T3D(make_device_ptr(T.origin()), {nw, nsp, NMO * npts}); 279 CMatrix_ref G2D(make_device_ptr(G.origin()), {nw * nsp * NMO, NMO}); 280 281 // T1[iw][ispin][i][r] = sum_j G[iw][ispin][i][j] * Psi(j,r) 282 int i0, iN; 283 std::tie(i0, iN) = FairDivideBoundary(TG.TG_local().rank(), int(G2D.size(0)), TG.TG_local().size()); 284 ma::product(G2D.sliced(i0, iN), Orbitals, T.sliced(i0, iN)); 285 TG.TG_local().barrier(); 286 287 // G[iw][ispin][r][r'] = sum_ij conj( Psi(i,r) ) * G[iw][ispin][i][j] * Psi(j,r') 288 // = sum_i conj( Psi(i,r) ) T1[iw][ispin][i][r'] = H(Psi) * T1 289 std::vector<decltype(ma::pointer_dispatch(Orbitals.origin()))> Aarray; 290 Aarray.reserve(nw * nsp); 291 std::vector<decltype(ma::pointer_dispatch(Orbitals.origin()))> Barray; 292 Barray.reserve(nw * nsp); 293 std::vector<decltype(ma::pointer_dispatch(Orbitals.origin()))> Carray; 294 Carray.reserve(nw * nsp); 295 for (int iw = 0, p = 0; iw < nw; ++iw) 296 for (int ispin = 0; ispin < nsp; ++ispin, ++p) 297 if (p % TG.TG_local().size() == TG.TG_local().rank()) 298 { 299 Aarray.emplace_back(ma::pointer_dispatch(Orbitals.origin())); 300 Barray.emplace_back(ma::pointer_dispatch(T3D[iw][ispin].origin())); 301 Carray.emplace_back(ma::pointer_dispatch(Gr3D[iw][ispin].origin())); 302 } 303 // careful with fortran ordering 304 gemmBatched('N', 'C', npts, npts, NMO, ComplexType(1.0), Barray.data(), npts, Aarray.data(), npts, 305 ComplexType(0.0), Carray.data(), npts, int(Aarray.size())); 306 TG.TG_local().barrier(); 307 std::tie(i0, iN) = FairDivideBoundary(TG.TG_local().rank(), int(Gr.num_elements()), TG.TG_local().size()); 308 copy_n(Gr.origin() + i0, (iN - i0), Gr_host.origin() + i0); 309 } 310 TG.TG_local().barrier(); 311 312 // : Z(type,r,r') = < (nup_r OP1 ndn_r) (nup_r' OP2 ndn_r') > 313 // where type= 0: charge-charge --> ++ 314 // 1: spin-spin --> -- 315 // 2: charge-spin --> +- 316 // THIS COULD BE WRITTEN WITH OPENMP TO ENABLE GPU EASILY!!! 317 for (int iw = 0; iw < nw; iw++) 318 { 319 if (TG.TG_local().root()) 320 denom[iw] += Xw[iw]; 321 if (iw % TG.TG_local().size() != TG.TG_local().rank()) 322 continue; 323 if (walker_type == CLOSED) 324 { 325 auto&& Gur(Gr_host[iw][0]); 326 stdCMatrix_ref DMcc(to_address(DMWork[iw][0].origin()), {npts, npts}); 327 stdCMatrix_ref DMss(to_address(DMWork[iw][1].origin()), {npts, npts}); 328 stdCMatrix_ref DMcs(to_address(DMWork[iw][2].origin()), {npts, npts}); 329 auto X_(2.0 * Xw[iw]); 330 for (int i = 0; i < npts; i++) 331 for (int j = 0; j < npts; j++) 332 { 333 auto g1(Gur[i][i] * Gur[j][j]); 334 auto g2(Gur[i][j] * Gur[j][i]); 335 auto o1(Gur[i][j] * Orrp[j][i]); 336 DMcc[i][j] += X_ * (2.0 * g1 - g2 + o1); 337 DMss[i][j] += X_ * (-g2 + o1); 338 // no cs in RHF 339 } 340 } 341 else if (walker_type == COLLINEAR) 342 { 343 auto&& Gur(Gr_host[iw][0]); 344 auto&& Gdr(Gr_host[iw][1]); 345 stdCMatrix_ref DMcc(to_address(DMWork[iw][0].origin()), {npts, npts}); 346 stdCMatrix_ref DMss(to_address(DMWork[iw][1].origin()), {npts, npts}); 347 stdCMatrix_ref DMcs(to_address(DMWork[iw][2].origin()), {npts, npts}); 348 auto X_(Xw[iw]); 349 for (int i = 0; i < npts; i++) 350 for (int j = 0; j < npts; j++) 351 { 352 auto guu(Gur[i][i] * Gur[j][j] - Gur[i][j] * Gur[j][i] + Gur[i][j] * Orrp[j][i]); 353 auto gdd(Gdr[i][i] * Gdr[j][j] - Gdr[i][j] * Gdr[j][i] + Gdr[i][j] * Orrp[j][i]); 354 auto gud(Gur[i][i] * Gdr[j][j]); 355 auto gdu(Gdr[i][i] * Gur[j][j]); 356 DMcc[i][j] += X_ * (guu + gud + gdu + gdd); 357 DMss[i][j] += X_ * (guu - gud - gdu + gdd); 358 DMcs[i][j] += X_ * (guu - gud + gdu - gdd); 359 } 360 } 361 else if (walker_type == NONCOLLINEAR) 362 { 363 APP_ABORT(" Noncollinear not implemented \n\n\n"); 364 auto&& Gur(Gr_host[iw][0]); 365 auto&& Gdr(Gr_host[iw][1]); 366 stdCMatrix_ref DMcc(to_address(DMWork[iw][0].origin()), {npts, npts}); 367 stdCMatrix_ref DMss(to_address(DMWork[iw][1].origin()), {npts, npts}); 368 stdCMatrix_ref DMcs(to_address(DMWork[iw][2].origin()), {npts, npts}); 369 auto X_(Xw[iw]); 370 for (int i = 0; i < npts; i++) 371 for (int j = 0; j < npts; j++) 372 { 373 auto guu(Gur[i][i] * Gur[j][j] - Gur[i][j] * Gur[j][i]); 374 auto gdd(Gdr[i][i] * Gdr[j][j] - Gdr[i][j] * Gdr[j][i]); 375 auto gud(Gur[i][i] * Gdr[j][j] - Gur[i][j] * Gdr[j][i]); 376 auto gdu(Gdr[i][i] * Gur[j][j] - Gdr[i][j] * Gur[j][i]); 377 DMcc[i][j] += X_ * (guu + gud + gdu + gdd); 378 DMss[i][j] += X_ * (guu - gud - gdu + gdd); 379 DMcs[i][j] += X_ * (guu - gud + gdu - gdd); 380 } 381 } 382 } 383 TG.TG_local().barrier(); 384 } 385 386 template<class HostCVec> accumulate_block(int iav,HostCVec && wgt,bool impsamp)387 void accumulate_block(int iav, HostCVec&& wgt, bool impsamp) 388 { 389 int nw(denom.size(0)); 390 TG.TG_local().barrier(); 391 // this is meant to be small, so serializing 392 if (TG.TG_local().root()) 393 { 394 for (int iw = 0; iw < nw; iw++) 395 denom[iw] = wgt[iw] / denom[iw]; 396 stdCVector_ref DMAv1D(to_address(DMAverage[iav].origin()), {3 * dm_size}); 397 stdCMatrix_ref DMWork2D(to_address(DMWork.origin()), {nw, 3 * dm_size}); 398 // DMAverage[iav][t][ij] += sum_iw DMWork[iw][t][ij] * denom[iw] = T( DMWork ) * denom 399 ma::product(ComplexType(1.0, 0.0), ma::T(DMWork2D), denom, ComplexType(1.0, 0.0), DMAv1D); 400 } 401 TG.TG_local().barrier(); 402 } 403 404 template<class HostCVec> print(int iblock,hdf_archive & dump,HostCVec && Wsum)405 void print(int iblock, hdf_archive& dump, HostCVec&& Wsum) 406 { 407 using std::fill_n; 408 const int n_zero = 9; 409 410 if (TG.TG_local().root()) 411 { 412 ma::scal(ComplexType(1.0 / block_size), DMAverage); 413 TG.TG_heads().reduce_in_place_n(to_address(DMAverage.origin()), DMAverage.num_elements(), std::plus<>(), 0); 414 if (writer) 415 { 416 dump.push(std::string("OD2RDM")); 417 for (int t = 0; t < 3; ++t) 418 { 419 dump.push(type_id[t]); 420 for (int i = 0; i < nave; ++i) 421 { 422 dump.push(std::string("Average_") + std::to_string(i)); 423 std::string padded_iblock = 424 std::string(n_zero - std::to_string(iblock).length(), '0') + std::to_string(iblock); 425 stdCVector_ref DMAverage_(to_address(DMAverage[i][t].origin()), {dm_size}); 426 dump.write(DMAverage_, "od2rdm_" + type_id[t] + padded_iblock); 427 dump.write(Wsum[i], "denominator_" + padded_iblock); 428 dump.pop(); 429 } 430 dump.pop(); 431 } 432 dump.pop(); 433 } 434 } 435 TG.TG_local().barrier(); 436 fill_n(DMAverage.origin(), DMAverage.num_elements(), ComplexType(0.0, 0.0)); 437 } 438 439 private: 440 Allocator alloc; 441 442 int block_size; 443 444 int nave; 445 446 int counter; 447 448 TaskGroup_& TG; 449 450 WALKER_TYPES walker_type; 451 452 int dm_size; 453 454 bool writer; 455 456 std::vector<std::string> type_id; 457 458 // unfolding of orbitals to super cell (in KP case) is assumed done 459 // as a preprocessing step 460 CMatrix Orbitals; 461 462 // type:0-cc, 1-ss, 2-cs (c=charge, s=spin) 463 // np = # of points in real space. 464 // DMAverage (nave, type, np*np ) 465 mpi3CTensor DMAverage; 466 // DMWork (nwalk, type, np*np ) 467 mpi3CTensor DMWork; 468 469 mpi3CVector denom; 470 471 mpi3C4Tensor Gr_host; 472 mpi3CMatrix Orrp; 473 }; 474 475 } // namespace afqmc 476 } // namespace qmcplusplus 477 478 #endif 479