1 ////////////////////////////////////////////////////////////////////// 2 // This file is distributed under the University of Illinois/NCSA Open Source 3 // License. See LICENSE file in top directory for details. 4 // 5 // Copyright (c) 2016 Jeongnim Kim and QMCPACK developers. 6 // 7 // File developed by: 8 // Miguel A. Morales, moralessilva2@llnl.gov 9 // Lawrence Livermore National Laboratory 10 // 11 // File created by: 12 // Miguel A. Morales, moralessilva2@llnl.gov 13 // Lawrence Livermore National Laboratory 14 //////////////////////////////////////////////////////////////////////////////// 15 16 #ifndef QMCPLUSPLUS_AFQMC_FULL2RDM_HPP 17 #define QMCPLUSPLUS_AFQMC_FULL2RDM_HPP 18 19 #include "AFQMC/config.h" 20 #include <vector> 21 #include <string> 22 #include <iostream> 23 24 #include "hdf/hdf_multi.h" 25 #include "hdf/hdf_archive.h" 26 #include "OhmmsData/libxmldefs.h" 27 #include "Utilities/Timer.h" 28 29 #include "AFQMC/Walkers/WalkerSet.hpp" 30 #include "AFQMC/Numerics/ma_operations.hpp" 31 #include "AFQMC/Memory/buffer_managers.h" 32 33 namespace qmcplusplus 34 { 35 namespace afqmc 36 { 37 /* 38 * Observable class that calculates the walker averaged 2 RDM. 39 * The resulting RDM will be [3*spin][i][k][j][l] 40 * where x:2 for NONCOLLINEAR and 1 for everything else. 41 * For collinear, the spin ordering is (a,a,a,a), (a,a,b,b), (b,b,b,b) 42 */ 43 class full2rdm : public AFQMCInfo 44 { 45 // allocators 46 using Allocator = device_allocator<ComplexType>; 47 using Allocator_shared = node_allocator<ComplexType>; 48 49 // type defs 50 using pointer = typename Allocator::pointer; 51 using const_pointer = typename Allocator::const_pointer; 52 using pointer_shared = typename Allocator_shared::pointer; 53 using const_pointer_shared = typename Allocator_shared::const_pointer; 54 55 using CVector_ref = boost::multi::array_ref<ComplexType, 1, pointer>; 56 using CMatrix_ref = boost::multi::array_ref<ComplexType, 2, pointer>; 57 using CVector = boost::multi::array<ComplexType, 1, Allocator>; 58 using CMatrix = boost::multi::array<ComplexType, 2, Allocator>; 59 using sharedCMatrix = boost::multi::array<ComplexType, 2, Allocator_shared>; 60 using stdCVector_ref = boost::multi::array_ref<ComplexType, 1>; 61 using stdCMatrix_ref = boost::multi::array_ref<ComplexType, 2>; 62 using stdCVector = boost::multi::array<ComplexType, 1>; 63 using stdCMatrix = boost::multi::array<ComplexType, 2>; 64 using stdIMatrix = boost::multi::array<int, 2>; 65 using mpi3CVector = boost::multi::array<ComplexType, 1, shared_allocator<ComplexType>>; 66 using mpi3IMatrix = boost::multi::array<int, 2, shared_allocator<int>>; 67 using mpi3CMatrix = boost::multi::array<ComplexType, 2, shared_allocator<ComplexType>>; 68 using mpi3CTensor = boost::multi::array<ComplexType, 3, shared_allocator<ComplexType>>; 69 using mpi3C4Tensor = boost::multi::array<ComplexType, 4, shared_allocator<ComplexType>>; 70 71 using stack_alloc_type = DeviceBufferManager::template allocator_t<ComplexType>; 72 using StaticVector = boost::multi::static_array<ComplexType, 1, stack_alloc_type>; 73 using StaticMatrix = boost::multi::static_array<ComplexType, 2, stack_alloc_type>; 74 75 public: full2rdm(afqmc::TaskGroup_ & tg_,AFQMCInfo & info,xmlNodePtr cur,WALKER_TYPES wlk,int nave_=1,int bsize=1)76 full2rdm(afqmc::TaskGroup_& tg_, AFQMCInfo& info, xmlNodePtr cur, WALKER_TYPES wlk, int nave_ = 1, int bsize = 1) 77 : AFQMCInfo(info), 78 TG(tg_), 79 walker_type(wlk), 80 writer(false), 81 block_size(bsize), 82 nave(nave_), 83 counter(0), 84 apply_rotation(false), 85 XRot({0, 0}, make_node_allocator<ComplexType>(TG)), 86 denom(iextensions<1u>{0}, shared_allocator<ComplexType>{TG.TG_local()}), 87 DMAverage({0, 0}, shared_allocator<ComplexType>{TG.TG_local()}), 88 DMWork({0, 0}, shared_allocator<ComplexType>{TG.TG_local()}) 89 { 90 using std::copy_n; 91 using std::fill_n; 92 93 app_log() << " -- Adding Back Propagated 2RDM (TwoRDM) estimator. -- \n"; 94 95 std::string rot_file(""); 96 std::string path("/"); 97 std::string str("false"); 98 99 if (cur != NULL) 100 { 101 ParameterSet m_param; 102 m_param.add(rot_file, "rotation"); 103 m_param.add(path, "path"); 104 m_param.put(cur); 105 } 106 107 if (rot_file != "") 108 { 109 if (not file_exists(rot_file)) 110 { 111 app_error() << " Error: File with rotation matrix does not exist: " << rot_file << std::endl; 112 APP_ABORT(""); 113 } 114 apply_rotation = true; 115 int dim[2]; 116 117 hdf_archive dump; 118 if (TG.Node().root()) 119 { 120 if (!dump.open(rot_file, H5F_ACC_RDONLY)) 121 APP_ABORT("Error opening orbitals file for n2r estimator.\n"); 122 if (!dump.push(path, false)) 123 APP_ABORT("Error in full2rdm: path not found."); 124 stdCMatrix R; 125 if (!dump.readEntry(R, "RotationMatrix")) 126 APP_ABORT("Error reading RotationMatrix.\n"); 127 if (R.size(1) != NMO) 128 APP_ABORT("Error Wrong dimensions in RotationMatrix.\n"); 129 dim[0] = R.size(0); 130 dim[1] = 0; 131 // conjugate rotation matrix 132 std::transform(R.origin(), R.origin() + R.num_elements(), R.origin(), __anon815ea36c0102(const auto& c) 133 [](const auto& c) { return std::conj(c); }); 134 TG.Node().broadcast_n(dim, 2, 0); 135 XRot = sharedCMatrix({dim[0], NMO}, make_node_allocator<ComplexType>(TG)); 136 copy_n(R.origin(), R.num_elements(), make_device_ptr(XRot.origin())); 137 if (TG.Node().root()) 138 TG.Cores().broadcast_n(to_address(XRot.origin()), XRot.num_elements(), 0); 139 140 dump.pop(); 141 dump.close(); 142 } 143 else 144 { 145 TG.Node().broadcast_n(dim, 2, 0); 146 XRot = sharedCMatrix({dim[0], NMO}, make_node_allocator<ComplexType>(TG)); 147 if (TG.Node().root()) 148 TG.Cores().broadcast_n(to_address(XRot.origin()), XRot.num_elements(), 0); 149 } 150 TG.Node().barrier(); 151 152 dm_size = XRot.size(0) * XRot.size(0) * XRot.size(0) * XRot.size(0); 153 } 154 else 155 { 156 dm_size = NMO * NMO * NMO * NMO; 157 } 158 159 // (a,a,a,a), (a,a,b,b) 160 nspinblocks = 2; 161 if (walker_type == COLLINEAR) 162 { 163 nspinblocks = 3; // (a,a,a,a), (a,a,b,b), (b,b,b,b) 164 } 165 else if (walker_type == NONCOLLINEAR) 166 APP_ABORT(" Error: NONCOLLINEAR not yet implemented. \n\n\n"); 167 168 dm_size *= nspinblocks; 169 170 using std::fill_n; 171 writer = (TG.getGlobalRank() == 0); 172 173 DMAverage = mpi3CMatrix({nave, dm_size}, shared_allocator<ComplexType>{TG.TG_local()}); 174 fill_n(DMAverage.origin(), DMAverage.num_elements(), ComplexType(0.0, 0.0)); 175 } 176 177 template<class MatG, class MatG_host, class HostCVec1, class HostCVec2, class HostCVec3> accumulate_reference(int iav,int iref,MatG && G,MatG_host && G_host,HostCVec1 && wgt,HostCVec2 && Xw,HostCVec3 && ovlp,bool impsamp)178 void accumulate_reference(int iav, 179 int iref, 180 MatG&& G, 181 MatG_host&& G_host, 182 HostCVec1&& wgt, 183 HostCVec2&& Xw, 184 HostCVec3&& ovlp, 185 bool impsamp) 186 { 187 static_assert(std::decay<MatG>::type::dimensionality == 4, "Wrong dimensionality"); 188 static_assert(std::decay<MatG_host>::type::dimensionality == 4, "Wrong dimensionality"); 189 using std::fill_n; 190 // assumes G[nwalk][spin][M][M] 191 int nw(G.size(0)); 192 assert(G.size(0) == wgt.size(0)); 193 assert(wgt.size(0) == nw); 194 assert(Xw.size(0) == nw); 195 assert(ovlp.size(0) >= nw); 196 assert(G.num_elements() == G_host.num_elements()); 197 assert(G.extensions() == G_host.extensions()); 198 199 // check structure dimensions 200 if (iref == 0) 201 { 202 if (denom.size(0) != nw) 203 { 204 denom = mpi3CVector(iextensions<1u>{nw}, shared_allocator<ComplexType>{TG.TG_local()}); 205 } 206 if (DMWork.size(0) != nw || DMWork.size(1) != dm_size) 207 { 208 DMWork = mpi3CMatrix({nw, dm_size}, shared_allocator<ComplexType>{TG.TG_local()}); 209 } 210 fill_n(denom.origin(), denom.num_elements(), ComplexType(0.0, 0.0)); 211 fill_n(DMWork.origin(), DMWork.num_elements(), ComplexType(0.0, 0.0)); 212 } 213 else 214 { 215 if (denom.size(0) != nw || DMWork.size(0) != nw || DMWork.size(1) != dm_size || DMAverage.size(0) != nave || 216 DMAverage.size(1) != dm_size) 217 APP_ABORT(" Error: Invalid state in accumulate_reference. \n\n\n"); 218 } 219 220 if (apply_rotation) 221 acc_with_rotation(G, Xw); 222 else 223 acc_no_rotation(G, Xw); 224 } 225 226 227 template<class HostCVec> accumulate_block(int iav,HostCVec && wgt,bool impsamp)228 void accumulate_block(int iav, HostCVec&& wgt, bool impsamp) 229 { 230 int nw(denom.size(0)); 231 int i0, iN; 232 std::tie(i0, iN) = FairDivideBoundary(TG.TG_local().rank(), dm_size, TG.TG_local().size()); 233 234 if (TG.TG_local().root()) 235 for (int iw = 0; iw < nw; iw++) 236 denom[iw] = wgt[iw] / denom[iw]; 237 TG.TG_local().barrier(); 238 239 // DMAverage[iav][ij] += sum_iw DMWork[iw][ij] * denom[iw] = T( DMWork ) * denom 240 ma::product(ComplexType(1.0, 0.0), ma::T(DMWork({0, nw}, {i0, iN})), denom, ComplexType(1.0, 0.0), 241 DMAverage[iav].sliced(i0, iN)); 242 TG.TG_local().barrier(); 243 } 244 245 template<class HostCVec> print(int iblock,hdf_archive & dump,HostCVec && Wsum)246 void print(int iblock, hdf_archive& dump, HostCVec&& Wsum) 247 { 248 using std::fill_n; 249 const int n_zero = 9; 250 251 if (TG.TG_local().root()) 252 { 253 ma::scal(ComplexType(1.0 / block_size), DMAverage); 254 TG.TG_heads().reduce_in_place_n(to_address(DMAverage.origin()), DMAverage.num_elements(), std::plus<>(), 0); 255 if (writer) 256 { 257 dump.push(std::string("FullTwoRDM")); 258 for (int i = 0; i < nave; ++i) 259 { 260 dump.push(std::string("Average_") + std::to_string(i)); 261 std::string padded_iblock = 262 std::string(n_zero - std::to_string(iblock).length(), '0') + std::to_string(iblock); 263 stdCVector_ref DMAverage_(to_address(DMAverage[i].origin()), {dm_size}); 264 dump.write(DMAverage_, "two_rdm_" + padded_iblock); 265 dump.write(Wsum[i], "denominator_" + padded_iblock); 266 dump.pop(); 267 } 268 dump.pop(); 269 } 270 } 271 TG.TG_local().barrier(); 272 fill_n(DMAverage.origin(), DMAverage.num_elements(), ComplexType(0.0, 0.0)); 273 } 274 275 private: 276 TaskGroup_& TG; 277 278 WALKER_TYPES walker_type; 279 280 bool writer; 281 282 int block_size; 283 284 int nave; 285 286 int counter; 287 288 int nspinblocks; 289 290 int dm_size; 291 292 bool apply_rotation; 293 294 sharedCMatrix XRot; 295 stdCVector Grot; 296 297 mpi3CVector denom; 298 299 // DMAverage (nave, nspinblocks, x*NMO*x*NMO), x=(1:CLOSED/COLLINEAR, 2:NONCOLLINEAR) 300 mpi3CMatrix DMAverage; 301 // DMWork (nwalk, nspinblocks, x*NMO*x*NMO), x=(1:CLOSED/COLLINEAR, 2:NONCOLLINEAR) 302 mpi3CMatrix DMWork; 303 304 // expects host accesible memory 305 template<class MatG, class CVec> acc_no_rotation(MatG && G,CVec && Xw)306 void acc_no_rotation(MatG&& G, CVec&& Xw) 307 { 308 // doing this 1 walker at a time and not worrying about speed 309 int nw(G.size(0)); 310 311 int i0, iN; 312 std::tie(i0, iN) = FairDivideBoundary(TG.TG_local().rank(), NMO * NMO, TG.TG_local().size()); 313 int dN = iN - i0; 314 315 size_t M2(NMO * NMO); 316 size_t M4(M2 * M2); 317 size_t N = size_t(dN) * M2; 318 DeviceBufferManager buffer_manager; 319 StaticMatrix R({dN, NMO * NMO}, buffer_manager.get_generator().template get_allocator<ComplexType>()); 320 CMatrix_ref Q(R.origin(), {NMO * NMO, NMO}); 321 CVector_ref R1D(R.origin(), R.num_elements()); 322 CVector_ref Q1D(Q.origin(), Q.num_elements()); 323 324 // put this in shared memory!!! 325 StaticMatrix Gt({NMO, NMO}, buffer_manager.get_generator().template get_allocator<ComplexType>()); 326 CMatrix_ref GtC(Gt.origin(), {NMO * NMO, 1}); 327 #if defined(ENABLE_CUDA) || defined(ENABLE_HIP) 328 if (Grot.size() < R.num_elements()) 329 Grot = stdCVector(iextensions<1u>(R.num_elements())); 330 #endif 331 332 if (TG.TG_local().size() > 1) 333 APP_ABORT(" ncores > 1 not yet allowed in full2rdm.\n\n\n"); 334 335 // (ikjl) = Gik * Gjl - (same spin) Gil Gjk 336 if (walker_type == COLLINEAR) 337 { 338 for (int iw = 0; iw < nw; iw++) 339 { 340 if (TG.TG_local().root()) 341 denom[iw] += Xw[iw]; 342 343 CMatrix_ref Gup(make_device_ptr(G[iw][0].origin()), {NMO * NMO, 1}); 344 CMatrix_ref Gdn(make_device_ptr(G[iw][1].origin()), {NMO * NMO, 1}); 345 // use ger !!!! 346 347 // (a,a,a,a) 348 ma::product(Gup.sliced(i0, iN), ma::T(Gup), R); 349 #if defined(ENABLE_CUDA) || defined(ENABLE_HIP) 350 using std::copy_n; 351 copy_n(R.origin(), R.num_elements(), Grot.origin()); 352 ma::axpy(Xw[iw], Grot, DMWork[iw].sliced(size_t(i0) * M2, size_t(iN) * M2)); 353 #else 354 ma::axpy(Xw[iw], R1D, DMWork[iw].sliced(size_t(i0) * M2, size_t(iN) * M2)); 355 #endif 356 TG.TG_local().barrier(); 357 ma::transpose(G[iw][0], Gt); 358 // EXX: (ikjl) -= Gil Gjk = Gt_kj x G[i]l for each i 359 // parallelize this!!! 360 for (int i = 0; i < NMO; ++i) 361 { 362 ma::product(ComplexType(-1.0), GtC, G[iw][0].sliced(i, i + 1), ComplexType(0.0), Q); 363 #if defined(ENABLE_CUDA) || defined(ENABLE_HIP) 364 using std::copy_n; 365 copy_n(Q.origin(), Q.num_elements(), Grot.origin()); 366 ma::axpy(Xw[iw], Grot.sliced(0, Q.num_elements()), 367 DMWork[iw].sliced(size_t(i * NMO) * M2, size_t((i + 1) * NMO) * M2)); 368 #else 369 ma::axpy(Xw[iw], Q1D, DMWork[iw].sliced(size_t(i * NMO) * M2, size_t((i + 1) * NMO) * M2)); 370 #endif 371 } 372 373 // (a,a,b,b) 374 ma::product(Gup.sliced(i0, iN), ma::T(Gdn), R); 375 #if defined(ENABLE_CUDA) || defined(ENABLE_HIP) 376 using std::copy_n; 377 copy_n(R.origin(), R.num_elements(), Grot.origin()); 378 ma::axpy(Xw[iw], Grot, DMWork[iw].sliced(M4 + size_t(i0) * M2, M4 + size_t(iN) * M2)); 379 #else 380 ma::axpy(Xw[iw], R1D, DMWork[iw].sliced(M4 + size_t(i0) * M2, M4 + size_t(iN) * M2)); 381 #endif 382 383 // (b,b,b,b) 384 ma::product(Gdn.sliced(i0, iN), ma::T(Gdn), R); 385 #if defined(ENABLE_CUDA) || defined(ENABLE_HIP) 386 using std::copy_n; 387 copy_n(R.origin(), R.num_elements(), Grot.origin()); 388 ma::axpy(Xw[iw], Grot, DMWork[iw].sliced(2 * M4 + size_t(i0) * M2, 2 * M4 + size_t(iN) * M2)); 389 #else 390 ma::axpy(Xw[iw], R1D, DMWork[iw].sliced(2 * M4 + size_t(i0) * M2, 2 * M4 + size_t(iN) * M2)); 391 #endif 392 TG.TG_local().barrier(); 393 ma::transpose(G[iw][1], Gt); 394 // EXX: (ikjl) -= Gil Gjk = Gt_kj x G[i]l for each i 395 // parallelize this!!! 396 for (int i = 0; i < NMO; ++i) 397 { 398 ma::product(ComplexType(-1.0), GtC, G[iw][1].sliced(i, i + 1), ComplexType(0.0), Q); 399 #if defined(ENABLE_CUDA) || defined(ENABLE_HIP) 400 using std::copy_n; 401 copy_n(Q.origin(), Q.num_elements(), Grot.origin()); 402 ma::axpy(Xw[iw], Grot.sliced(0, Q.num_elements()), 403 DMWork[iw].sliced(2 * M4 + size_t(i * NMO) * M2, 2 * M4 + size_t((i + 1) * NMO) * M2)); 404 #else 405 ma::axpy(Xw[iw], Q1D, DMWork[iw].sliced(2 * M4 + size_t(i * NMO) * M2, 2 * M4 + size_t((i + 1) * NMO) * M2)); 406 #endif 407 } 408 } 409 } 410 else 411 { 412 /* 413 int N = dN*NMO*NMO; 414 set_buffer(N); 415 CMatrix_ref R( Buff.origin(), {dN,NMO*NMO}); 416 CVector_ref R1D( Buff.origin(), {dN*NMO*NMO}); 417 #if defined(ENABLE_CUDA) 418 if(Grot.size() < R.num_elements()) 419 Grot = stdCVector(iextensions<1u>(R.num_elements())); 420 #endif 421 422 423 for(int iw=0; iw<nw; iw++) { 424 425 if(TG.TG_local().root()) denom[iw] += Xw[iw]; 426 427 CMatrix_ref Gw( to_address(G[iw].origin()), {G[0].num_elements(),1}); 428 // use ger later 429 ma::product( Gw.sliced(i0,iN), ma::T(Gw), R ); 430 431 #if defined(ENABLE_CUDA) 432 using std::copy_n; 433 copy_n(R.origin(),R.num_elements(),Grot.origin()); 434 ma::axpy( Xw[iw], Grot, DMWork[iw].sliced(i0*NMO*NMO,iN*NMO*NMO) ); 435 #else 436 ma::axpy( Xw[iw], R.sliced(i0,iN), DMWork[iw].sliced(i0,iN) ); 437 #endif 438 } 439 */ 440 } 441 TG.TG_local().barrier(); 442 } 443 444 // G should be device accesible memory 445 // Xw is in host 446 template<class MatG, class CVec> acc_with_rotation(MatG && G,CVec && Xw)447 void acc_with_rotation(MatG&& G, CVec&& Xw) 448 { 449 /* 450 int i0, iN; 451 std::tie(i0,iN) = FairDivideBoundary(TG.TG_local().rank(),int(XRot.size(0)),TG.TG_local().size()); 452 453 // can batch in the future if too slow 454 // Grot = Xc * G * H(Xc) 455 int nX = XRot.size(0); 456 int sz = nX * (NMO + (iN-i0)); 457 int npts = (iN-i0)*nX; 458 set_buffer(sz); 459 CMatrix_ref T1(Buff.origin(),{(iN-i0),NMO}); 460 CMatrix_ref T2(T1.origin()+T1.num_elements(),{(iN-i0),nX}); 461 if(Grot.size() != npts) 462 Grot = stdCVector(iextensions<1u>(npts)); 463 464 // round-robin for now 465 int cnt=0; 466 for(int iw=0; iw<nw; iw++) { 467 if(i0==iN || i0==XRot.size(0)) break; 468 if(TG.TG_local().root()) denom[iw] += Xw[iw]; 469 ma::product(XRot.sliced(i0,iN),G[iw][0],T1); 470 ma::product(T1,ma::H(XRot),T2); 471 copy_n(T2.origin(),T2.num_elements(),Grot.origin()); 472 ma::axpy( Xw[iw], Grot, DMWork[iw].sliced(i0*nX,i0*nX+npts) ); 473 if(walker_type == COLLINEAR) { 474 ma::product(XRot.sliced(i0,iN),G[iw][1],T1); 475 ma::product(T1,ma::H(XRot),T2); 476 copy_n(T2.origin(),T2.num_elements(),Grot.origin()); 477 ma::axpy( Xw[iw], Grot, DMWork[iw].sliced((nX+i0)*nX,(nX+i0)*nX+npts) ); 478 } 479 } 480 TG.TG_local().barrier(); 481 */ 482 } 483 }; 484 485 } // namespace afqmc 486 } // namespace qmcplusplus 487 488 #endif 489