1 ////////////////////////////////////////////////////////////////////// 2 // This file is distributed under the University of Illinois/NCSA Open Source 3 // License. See LICENSE file in top directory for details. 4 // 5 // Copyright (c) 2016 Jeongnim Kim and QMCPACK developers. 6 // 7 // File developed by: 8 // Miguel A. Morales, moralessilva2@llnl.gov 9 // Lawrence Livermore National Laboratory 10 // 11 // File created by: 12 // Miguel A. Morales, moralessilva2@llnl.gov 13 // Lawrence Livermore National Laboratory 14 //////////////////////////////////////////////////////////////////////////////// 15 16 #ifndef QMCPLUSPLUS_AFQMC_N2R_HPP 17 #define QMCPLUSPLUS_AFQMC_N2R_HPP 18 19 #include "AFQMC/config.h" 20 #include <vector> 21 #include <string> 22 #include <iostream> 23 24 #include "hdf/hdf_multi.h" 25 #include "hdf/hdf_archive.h" 26 #include "OhmmsData/libxmldefs.h" 27 #include "Utilities/Timer.h" 28 29 #include "AFQMC/Walkers/WalkerSet.hpp" 30 #include "AFQMC/Numerics/detail/utilities.hpp" 31 #include "AFQMC/Numerics/ma_operations.hpp" 32 #include "AFQMC/Numerics/batched_operations.hpp" 33 34 namespace qmcplusplus 35 { 36 namespace afqmc 37 { 38 /* 39 * Observable class that calculates the walker averaged on-top pair density 40 * Alloc defines the allocator type used to store the orbital and temporary tensors. 41 * In a device compilation, this would need out-of-card gemm available. 42 */ 43 template<class Alloc> 44 class n2r : public AFQMCInfo 45 { 46 // allocators 47 using Allocator = device_allocator<ComplexType>; 48 49 // type defs 50 using pointer = typename Allocator::pointer; 51 using const_pointer = typename Allocator::const_pointer; 52 53 // allocators 54 using aux_Allocator = Alloc; 55 56 using aux_pointer = typename aux_Allocator::pointer; 57 using const_aux_pointer = typename aux_Allocator::const_pointer; 58 59 using auxCVector = boost::multi::array<ComplexType, 1, aux_Allocator>; 60 using auxCMatrix = boost::multi::array<ComplexType, 2, aux_Allocator>; 61 using auxCVector_ref = boost::multi::array_ref<ComplexType, 1, aux_pointer>; 62 using auxCMatrix_ref = boost::multi::array_ref<ComplexType, 2, aux_pointer>; 63 64 using CVector_ref = boost::multi::array_ref<ComplexType, 1, pointer>; 65 using CMatrix_ref = boost::multi::array_ref<ComplexType, 2, pointer>; 66 using CVector = boost::multi::array<ComplexType, 1, Allocator>; 67 using CMatrix = boost::multi::array<ComplexType, 2, Allocator>; 68 using stdCVector = boost::multi::array<ComplexType, 1>; 69 using stdIVector = boost::multi::array<int, 1>; 70 using stdCVector_ref = boost::multi::array_ref<ComplexType, 1>; 71 using stdCMatrix_ref = boost::multi::array_ref<ComplexType, 2>; 72 using mpi3CVector = boost::multi::array<ComplexType, 1, shared_allocator<ComplexType>>; 73 using mpi3CMatrix = boost::multi::array<ComplexType, 2, shared_allocator<ComplexType>>; 74 using mpi3CTensor = boost::multi::array<ComplexType, 3, shared_allocator<ComplexType>>; 75 using mpi3C4Tensor = boost::multi::array<ComplexType, 4, shared_allocator<ComplexType>>; 76 77 public: n2r(afqmc::TaskGroup_ & tg_,AFQMCInfo & info,xmlNodePtr cur,WALKER_TYPES wlk,bool host_mem,aux_Allocator alloc_,aux_Allocator orb_alloc_,int nave_=1,int bsize=1)78 n2r(afqmc::TaskGroup_& tg_, 79 AFQMCInfo& info, 80 xmlNodePtr cur, 81 WALKER_TYPES wlk, 82 bool host_mem, 83 aux_Allocator alloc_, 84 aux_Allocator orb_alloc_, 85 int nave_ = 1, 86 int bsize = 1) 87 : AFQMCInfo(info), 88 aux_alloc(alloc_), 89 block_size(bsize), 90 nave(nave_), 91 counter(0), 92 TG(tg_), 93 walker_type(wlk), 94 dm_size(0), 95 writer(false), 96 use_host_memory(host_mem), 97 hdf_walker_output(""), 98 Orbitals({0, 0}, orb_alloc_), 99 DMAverage({0, 0}, shared_allocator<ComplexType>{TG.TG_local()}), 100 DMWork({0, 0}, shared_allocator<ComplexType>{TG.TG_local()}), 101 denom(iextensions<1u>{0}, shared_allocator<ComplexType>{TG.TG_local()}), 102 Buff(iextensions<1u>{0}, alloc_), 103 Buff2(iextensions<1u>{0}) 104 { 105 app_log() << " -- Adding Back Propagated on-top pair density (N2R) estimator. -- \n "; 106 std::string orb_file(""); 107 std::string str("false"); 108 109 if (cur != NULL) 110 { 111 ParameterSet m_param; 112 m_param.add(orb_file, "orbitals"); 113 m_param.put(cur); 114 } 115 116 if (not file_exists(orb_file)) 117 { 118 app_error() << " Error: orbitals file does not exist: " << orb_file << std::endl; 119 APP_ABORT(""); 120 } 121 122 stdIVector norbs(iextensions<1u>{0}); 123 stdIVector grid_dim(iextensions<1u>{3}); 124 dm_size = 0; 125 126 // read orbitals 127 hdf_archive dump; 128 if (TG.Node().root()) 129 { 130 if (!dump.open(orb_file, H5F_ACC_RDONLY)) 131 { 132 app_error() << " Error opening orbitals file for n2r estimator. \n"; 133 APP_ABORT(""); 134 } 135 if (!dump.push("OrbsR", false)) 136 { 137 app_error() << " Error in n2r: Group OrbsR not found." << std::endl; 138 APP_ABORT(""); 139 } 140 if (!dump.readEntry(grid_dim, "grid_dim")) 141 { 142 app_error() << " Error in n2r: Problems reading grid_dim. " << std::endl; 143 APP_ABORT(""); 144 } 145 assert(grid_dim.size() == 3); 146 dm_size = grid_dim[0] * grid_dim[1] * grid_dim[2]; 147 if (dm_size < 1) 148 { 149 app_error() << " Error in n2r: prod(grid_dim) < 1. " << std::endl; 150 APP_ABORT(""); 151 } 152 TG.Node().broadcast_n(&dm_size, 1, 0); 153 if (!dump.readEntry(norbs, "number_of_orbitals")) 154 { 155 app_error() << " Error in n2r: Problems reading number_of_orbitals. " << std::endl; 156 APP_ABORT(""); 157 } 158 int M = 0, nk = norbs.size(); 159 for (int i = 0; i < nk; i++) 160 M += norbs[i]; 161 if (M != NMO) 162 { 163 app_error() << " Error in n2r: Inconsistent number of orbitals in file: " << M << " expected:" << NMO 164 << std::endl; 165 APP_ABORT(""); 166 } 167 Orbitals = auxCMatrix({NMO, dm_size}, orb_alloc_); 168 for (int k = 0, kn = 0; k < nk; k++) 169 { 170 for (int i = 0; i < norbs[k]; i++, kn++) 171 { 172 stdCVector orb(iextensions<1u>{dm_size}); 173 if (!dump.readEntry(orb, "kp" + std::to_string(k) + "_b" + std::to_string(i))) 174 { 175 app_error() << " Error in n2r: Problems reading orbital: " << k << " " << i << std::endl; 176 APP_ABORT(""); 177 } 178 using std::copy_n; 179 copy_n(orb.origin(), dm_size, ma::pointer_dispatch(Orbitals[kn].origin())); 180 } 181 } 182 dump.pop(); 183 dump.close(); 184 185 app_log() << " Number of grid points: " << dm_size << std::endl; 186 } 187 else 188 { 189 TG.Node().broadcast_n(&dm_size, 1, 0); 190 Orbitals = auxCMatrix({NMO, dm_size}, orb_alloc_); 191 } 192 TG.Node().barrier(); 193 194 using std::fill_n; 195 writer = (TG.getGlobalRank() == 0); 196 197 DMAverage = mpi3CMatrix({nave, dm_size}, shared_allocator<ComplexType>{TG.TG_local()}); 198 fill_n(DMAverage.origin(), DMAverage.num_elements(), ComplexType(0.0, 0.0)); 199 } 200 201 template<class MatG, class MatG_host, class HostCVec1, class HostCVec2, class HostCVec3> accumulate_reference(int iav,int iref,MatG && G,MatG_host && G_host,HostCVec1 && wgt,HostCVec2 && Xw,HostCVec3 && ovlp,bool impsamp)202 void accumulate_reference(int iav, 203 int iref, 204 MatG&& G, 205 MatG_host&& G_host, 206 HostCVec1&& wgt, 207 HostCVec2&& Xw, 208 HostCVec3&& ovlp, 209 bool impsamp) 210 { 211 static_assert(std::decay<MatG>::type::dimensionality == 4, "Wrong dimensionality"); 212 static_assert(std::decay<MatG_host>::type::dimensionality == 4, "Wrong dimensionality"); 213 using std::copy_n; 214 using std::fill_n; 215 // assumes G[nwalk][spin][M][M] 216 int nw(G.size(0)); 217 assert(G.size(0) == wgt.size(0)); 218 assert(wgt.size(0) == nw); 219 assert(Xw.size(0) == nw); 220 assert(ovlp.size(0) >= nw); 221 assert(G.num_elements() == G_host.num_elements()); 222 assert(G.extensions() == G_host.extensions()); 223 224 int nsp; 225 if (walker_type == CLOSED) 226 nsp = 1; 227 else 228 nsp = 2; 229 230 // check structure dimensions 231 if (iref == 0) 232 { 233 if (denom.size(0) != nw) 234 { 235 denom = mpi3CVector(iextensions<1u>{nw}, shared_allocator<ComplexType>{TG.TG_local()}); 236 } 237 if (DMWork.size(0) != nw || DMWork.size(1) != dm_size) 238 { 239 DMWork = mpi3CMatrix({nw, dm_size}, shared_allocator<ComplexType>{TG.TG_local()}); 240 } 241 fill_n(denom.origin(), denom.num_elements(), ComplexType(0.0, 0.0)); 242 fill_n(DMWork.origin(), DMWork.num_elements(), ComplexType(0.0, 0.0)); 243 } 244 else 245 { 246 if (denom.size(0) != nw || DMWork.size(0) != nw || DMWork.size(1) != dm_size || DMAverage.size(0) != nave || 247 DMAverage.size(1) != dm_size) 248 APP_ABORT(" Error: Invalid state in accumulate_reference. \n\n\n"); 249 } 250 251 int i0, iN; 252 std::tie(i0, iN) = FairDivideBoundary(TG.TG_local().rank(), dm_size, TG.TG_local().size()); 253 254 // MAM: Uses out-of-card GEMM on device!!! 255 // T1[i][r] = sum_j G[spin][i][j] * Psi(j,r) 256 // G[spin][r] = sum_ij conj( Psi(i,r) ) * G[spin][i][j] * Psi(j,r) = sum_i conj( Psi(i,r) ) T1[i][r] 257 int N = dm_size * NMO + dm_size; 258 set_buffer(N); 259 auxCMatrix_ref T(Buff.origin(), {NMO, dm_size}); 260 auxCVector_ref Gr(Buff.origin() + T.num_elements(), iextensions<1u>{dm_size}); 261 262 int N2 = nsp * (iN - i0); 263 set_buffer2(N2); 264 stdCMatrix_ref Gr_(Buff2.origin(), {nsp, (iN - i0)}); 265 266 // change batched_dot to a ** interface to make it more general and useful 267 268 for (int iw = 0; iw < nw; iw++) 269 { 270 if (TG.TG_local().root()) 271 denom[iw] += Xw[iw]; 272 auto&& Gu = G[iw][0]; 273 auto&& Orb0N(Orbitals(Orbitals.extension(0), {i0, iN})); 274 auto&& T0N(T(T.extension(0), {i0, iN})); 275 ma::product(Gu, Orb0N, T0N); 276 using ma::batched_dot; 277 batched_dot('H', 'T', (iN - i0), NMO, ComplexType(1.0), ma::pointer_dispatch(Orb0N.origin()), Orb0N.stride(0), 278 ma::pointer_dispatch(T0N.origin()), T0N.stride(0), ComplexType(0.0), 279 ma::pointer_dispatch(Gr.origin()) + i0, 1); 280 /* 281 fill_n(Gr.origin(),dm_size,ComplexType(0.0,0.0)); 282 for(int i=0; i<NMO; i++) { 283 ComplexType* O_(Orbitals[i].origin()); 284 ComplexType* T_(T[i].origin()); 285 ComplexType* G_(Gr.origin()); 286 for(int j=0; j<dm_size; j++, O_++, T_++, G_++) 287 (*G_) += std::conj(*O_) * (*T_); 288 } 289 */ 290 using std::copy_n; 291 copy_n(ma::pointer_dispatch(Gr.origin()) + i0, (iN - i0), Gr_[0].origin()); 292 293 if (walker_type == CLOSED) 294 { 295 auto Gur(Gr_[0].origin()); 296 auto DM(to_address(DMWork[iw].origin()) + i0); 297 auto X_(2.0 * Xw[iw]); 298 for (int ir = i0; ir < iN; ir++, Gur++, DM++) 299 (*DM) += X_ * (*Gur) * (*Gur); 300 } 301 else if (walker_type == COLLINEAR) 302 { 303 auto&& Gd = G[iw][1]; 304 ma::product(Gd, Orb0N, T0N); 305 batched_dot('H', 'T', (iN - i0), NMO, ComplexType(1.0), ma::pointer_dispatch(Orb0N.origin()), Orb0N.stride(0), 306 ma::pointer_dispatch(T0N.origin()), T0N.stride(0), ComplexType(0.0), 307 ma::pointer_dispatch(Gr.origin()) + i0, 1); 308 using std::copy_n; 309 copy_n(ma::pointer_dispatch(Gr.origin()) + i0, (iN - i0), Gr_[1].origin()); 310 311 auto Gur(Gr_[0].origin()); 312 auto Gdr(Gr_[1].origin()); 313 auto DM(to_address(DMWork[iw].origin()) + i0); 314 auto X_(2.0 * Xw[iw]); 315 for (int ir = i0; ir < iN; ir++, Gur++, DM++) 316 (*DM) += X_ * (*Gur) * (*Gdr); 317 } 318 else if (walker_type == NONCOLLINEAR) 319 { 320 APP_ABORT(" Error: NONCOLLINEAR not yet implemented in n2r. \n"); 321 } 322 } 323 TG.TG_local().barrier(); 324 } 325 326 template<class HostCVec> accumulate_block(int iav,HostCVec && wgt,bool impsamp)327 void accumulate_block(int iav, HostCVec&& wgt, bool impsamp) 328 { 329 int nw(denom.size(0)); 330 int i0, iN; 331 std::tie(i0, iN) = FairDivideBoundary(TG.TG_local().rank(), dm_size, TG.TG_local().size()); 332 TG.TG_local().barrier(); 333 if (TG.TG_local().root()) 334 for (int iw = 0; iw < nw; iw++) 335 denom[iw] = wgt[iw] / denom[iw]; 336 TG.TG_local().barrier(); 337 338 // DMAverage[iav][ij] += sum_iw DMWork[iw][ij] * denom[iw] = T( DMWork ) * denom 339 ma::product(ComplexType(1.0, 0.0), ma::T(DMWork({0, nw}, {i0, iN})), denom, ComplexType(1.0, 0.0), 340 DMAverage[iav].sliced(i0, iN)); 341 TG.TG_local().barrier(); 342 } 343 344 template<class HostCVec> print(int iblock,hdf_archive & dump,HostCVec && Wsum)345 void print(int iblock, hdf_archive& dump, HostCVec&& Wsum) 346 { 347 using std::fill_n; 348 const int n_zero = 9; 349 350 if (TG.TG_local().root()) 351 { 352 ma::scal(ComplexType(1.0 / block_size), DMAverage); 353 TG.TG_heads().reduce_in_place_n(to_address(DMAverage.origin()), DMAverage.num_elements(), std::plus<>(), 0); 354 if (writer) 355 { 356 dump.push(std::string("N2R")); 357 for (int i = 0; i < nave; ++i) 358 { 359 dump.push(std::string("Average_") + std::to_string(i)); 360 std::string padded_iblock = 361 std::string(n_zero - std::to_string(iblock).length(), '0') + std::to_string(iblock); 362 stdCVector_ref DMAverage_(to_address(DMAverage[i].origin()), {dm_size}); 363 dump.write(DMAverage_, "n2r_" + padded_iblock); 364 dump.write(Wsum[i], "denominator_" + padded_iblock); 365 dump.pop(); 366 } 367 dump.pop(); 368 } 369 } 370 TG.TG_local().barrier(); 371 fill_n(DMAverage.origin(), DMAverage.num_elements(), ComplexType(0.0, 0.0)); 372 } 373 374 private: 375 aux_Allocator aux_alloc; 376 377 int block_size; 378 379 int nave; 380 381 int counter; 382 383 TaskGroup_& TG; 384 385 WALKER_TYPES walker_type; 386 387 int dm_size; 388 389 bool writer; 390 391 bool use_host_memory; 392 393 std::string hdf_walker_output; 394 395 auxCMatrix Orbitals; 396 397 // DMAverage (nave, spin*spin*x*NMO*(x*NMO-1)/2 ), x=(1:CLOSED/COLLINEAR, 2:NONCOLLINEAR) 398 mpi3CMatrix DMAverage; 399 // DMWork (nwalk, spin*spin*x*NMO*(x*NMO-1)/2 ), x=(1:CLOSED/COLLINEAR, 2:NONCOLLINEAR) 400 mpi3CMatrix DMWork; 401 402 mpi3CVector denom; 403 404 // buffer space 405 auxCVector Buff; 406 stdCVector Buff2; 407 set_buffer(size_t N)408 void set_buffer(size_t N) 409 { 410 if (Buff.num_elements() < N) 411 Buff = auxCVector(iextensions<1u>{N}, aux_alloc); 412 using std::fill_n; 413 fill_n(Buff.origin(), N, ComplexType(0.0)); 414 } set_buffer2(size_t N)415 void set_buffer2(size_t N) 416 { 417 if (Buff2.num_elements() < N) 418 Buff2 = stdCVector(iextensions<1u>{N}); 419 using std::fill_n; 420 fill_n(Buff2.origin(), N, ComplexType(0.0)); 421 } 422 }; 423 424 } // namespace afqmc 425 } // namespace qmcplusplus 426 427 #endif 428