1 //////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source
3 // License.  See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
6 //
7 // File developed by:
8 // Miguel A. Morales, moralessilva2@llnl.gov
9 //    Lawrence Livermore National Laboratory
10 //
11 // File created by:
12 // Miguel A. Morales, moralessilva2@llnl.gov
13 //    Lawrence Livermore National Laboratory
14 ////////////////////////////////////////////////////////////////////////////////
15 
16 #ifndef QMCPLUSPLUS_AFQMC_REALSPACE_CORRELATORS_HPP
17 #define QMCPLUSPLUS_AFQMC_REALSPACE_CORRELATORS_HPP
18 
19 #include "AFQMC/config.h"
20 #include <vector>
21 #include <string>
22 #include <iostream>
23 
24 #include "hdf/hdf_multi.h"
25 #include "hdf/hdf_archive.h"
26 #include "OhmmsData/libxmldefs.h"
27 #include "Utilities/Timer.h"
28 
29 #include "AFQMC/Walkers/WalkerSet.hpp"
30 #include "AFQMC/Numerics/detail/utilities.hpp"
31 #include "AFQMC/Numerics/ma_operations.hpp"
32 #include "AFQMC/Numerics/batched_operations.hpp"
33 #include "AFQMC/Memory/buffer_managers.h"
34 
35 namespace qmcplusplus
36 {
37 namespace afqmc
38 {
39 /*
40  * Observable class that calculates the walker averaged on-top pair density
41  * Alloc defines the allocator type used to store the orbital and temporary tensors.
42  * In a device compilation, this would need out-of-card gemm available.
43  */
44 class realspace_correlators : public AFQMCInfo
45 {
46   // allocators
47   using Allocator = localTG_allocator<ComplexType>;
48 
49   // type defs
50   using pointer = device_ptr<ComplexType>;
51 
52   using CVector_ref    = boost::multi::array_ref<ComplexType, 1, pointer>;
53   using CMatrix_ref    = boost::multi::array_ref<ComplexType, 2, pointer>;
54   using CTensor_ref    = boost::multi::array_ref<ComplexType, 3, pointer>;
55   using C4Tensor_ref   = boost::multi::array_ref<ComplexType, 4, pointer>;
56   using CVector        = boost::multi::array<ComplexType, 1, Allocator>;
57   using CMatrix        = boost::multi::array<ComplexType, 2, Allocator>;
58   using stdCVector     = boost::multi::array<ComplexType, 1>;
59   using stdCMatrix     = boost::multi::array<ComplexType, 2>;
60   using stdIVector     = boost::multi::array<int, 1>;
61   using stdCVector_ref = boost::multi::array_ref<ComplexType, 1>;
62   using stdCMatrix_ref = boost::multi::array_ref<ComplexType, 2>;
63   using mpi3CVector    = boost::multi::array<ComplexType, 1, shared_allocator<ComplexType>>;
64   using mpi3CMatrix    = boost::multi::array<ComplexType, 2, shared_allocator<ComplexType>>;
65   using mpi3CTensor    = boost::multi::array<ComplexType, 3, shared_allocator<ComplexType>>;
66   using mpi3C4Tensor   = boost::multi::array<ComplexType, 4, shared_allocator<ComplexType>>;
67 
68   using shm_stack_alloc_type = LocalTGBufferManager::template allocator_t<ComplexType>;
69   using StaticMatrix         = boost::multi::static_array<ComplexType, 2, shm_stack_alloc_type>;
70 
71 public:
realspace_correlators(afqmc::TaskGroup_ & tg_,AFQMCInfo & info,xmlNodePtr cur,WALKER_TYPES wlk,int nave_=1,int bsize=1)72   realspace_correlators(afqmc::TaskGroup_& tg_,
73                         AFQMCInfo& info,
74                         xmlNodePtr cur,
75                         WALKER_TYPES wlk,
76                         int nave_ = 1,
77                         int bsize = 1)
78       : AFQMCInfo(info),
79         alloc(make_localTG_allocator<ComplexType>(tg_)),
80         block_size(bsize),
81         nave(nave_),
82         counter(0),
83         TG(tg_),
84         walker_type(wlk),
85         dm_size(0),
86         writer(false),
87         Orbitals({0, 0}, alloc),
88         DMAverage({0, 0, 0}, shared_allocator<ComplexType>{TG.TG_local()}),
89         DMWork({0, 0, 0}, shared_allocator<ComplexType>{TG.TG_local()}),
90         denom(iextensions<1u>{0}, shared_allocator<ComplexType>{TG.TG_local()}),
91         Gr_host({0, 0, 0, 0}, shared_allocator<ComplexType>{TG.TG_local()}),
92         Orrp({0, 0}, shared_allocator<ComplexType>{TG.TG_local()})
93   {
94     app_log() << "  --  Adding Back Propagated real-space off diagonal 2RDM (realspace_correlators). -- \n ";
95     std::string orb_file("");
96     std::string str("false");
97 
98     if (cur != NULL)
99     {
100       ParameterSet m_param;
101       m_param.add(orb_file, "orbitals");
102       m_param.put(cur);
103     }
104 
105     if (not file_exists(orb_file))
106     {
107       app_error() << " Error: orbitals file does not exist: " << orb_file << std::endl;
108       APP_ABORT("");
109     }
110 
111     stdIVector norbs(iextensions<1u>{0});
112     dm_size     = 0;
113     int npoints = 0;
114 
115     // read orbitals
116     hdf_archive dump;
117     if (TG.Node().root())
118     {
119       if (!dump.open(orb_file, H5F_ACC_RDONLY))
120       {
121         app_error() << " Error opening orbitals file for realspace_correlators estimator. \n";
122         APP_ABORT("");
123       }
124       if (!dump.push("OrbsR", false))
125       {
126         app_error() << " Error in realspace_correlators: Group OrbsR not found." << std::endl;
127         APP_ABORT("");
128       }
129       // read one orbital to check size and later corroborate all orbitals have same size
130       stdCVector orb(iextensions<1u>{1});
131       if (!dump.readEntry(orb, "kp0_b0"))
132       {
133         app_error() << " Error in realspace_correlators: Problems reading orbital: 0  0" << std::endl;
134         APP_ABORT("");
135       }
136       npoints = orb.size(0);
137       if (npoints < 1)
138       {
139         app_error() << " Error in realspace_correlators: npoints < 1. " << std::endl;
140         APP_ABORT("");
141       }
142       TG.Node().broadcast_n(&npoints, 1, 0);
143       if (!dump.readEntry(norbs, "number_of_orbitals"))
144       {
145         app_error() << " Error in realspace_correlators: Problems reading number_of_orbitals. " << std::endl;
146         APP_ABORT("");
147       }
148       int M = 0, nk = norbs.size();
149       for (int i = 0; i < nk; i++)
150         M += norbs[i];
151       if (M != NMO)
152       {
153         app_error() << " Error in realspace_correlators: Inconsistent number of orbitals in file: " << M
154                     << " expected:" << NMO << std::endl;
155         APP_ABORT("");
156       }
157       TG.Node().broadcast_n(&npoints, 1, 0);
158       Orbitals = CMatrix({NMO, npoints}, alloc);
159       // host copy to calculate Orrp
160       stdCMatrix host_orb({NMO, npoints});
161       Orrp = mpi3CMatrix({npoints, npoints}, shared_allocator<ComplexType>{TG.TG_local()});
162       for (int k = 0, kn = 0; k < nk; k++)
163       {
164         for (int i = 0; i < norbs[k]; i++, kn++)
165         {
166           if (!dump.readEntry(orb, "kp" + std::to_string(k) + "_b" + std::to_string(i)))
167           {
168             app_error() << " Error in realspace_correlators: Problems reading orbital: " << k << " " << i << std::endl;
169             APP_ABORT("");
170           }
171           if (orb.size(0) != npoints)
172           {
173             app_error() << " Error in realspace_correlators: Inconsistent orbital size: " << k << " " << i << std::endl;
174             APP_ABORT("");
175           }
176           using std::copy_n;
177           copy_n(orb.origin(), npoints, ma::pointer_dispatch(Orbitals[kn].origin()));
178           copy_n(orb.origin(), npoints, host_orb[kn].origin());
179         }
180       }
181       dump.pop();
182       dump.close();
183 
184       // Or(r,r') = sum_j conj(psi(j,r)) * psi(j,r')
185       ma::product(ma::H(host_orb), host_orb, Orrp);
186 
187       app_log() << " Number of grid points: " << npoints << std::endl;
188     }
189     else
190     {
191       TG.Node().broadcast_n(&npoints, 1, 0);
192       Orbitals = CMatrix({NMO, npoints}, alloc);
193       Orrp     = mpi3CMatrix({npoints, npoints}, shared_allocator<ComplexType>{TG.TG_local()});
194     }
195     dm_size = npoints * npoints;
196     TG.Node().barrier();
197 
198     using std::fill_n;
199     writer = (TG.getGlobalRank() == 0);
200 
201     if (writer)
202     {
203       type_id.reserve(3);
204       type_id.emplace_back("CC");
205       type_id.emplace_back("SS");
206       type_id.emplace_back("CS");
207     }
208 
209     DMAverage = mpi3CTensor({nave, 3, dm_size}, shared_allocator<ComplexType>{TG.TG_local()});
210     fill_n(DMAverage.origin(), DMAverage.num_elements(), ComplexType(0.0, 0.0));
211   }
212 
213   template<class MatG, class MatG_host, class HostCVec1, class HostCVec2, class HostCVec3>
accumulate_reference(int iav,int iref,MatG && G,MatG_host && G_host,HostCVec1 && wgt,HostCVec2 && Xw,HostCVec3 && ovlp,bool impsamp)214   void accumulate_reference(int iav,
215                             int iref,
216                             MatG&& G,
217                             MatG_host&& G_host,
218                             HostCVec1&& wgt,
219                             HostCVec2&& Xw,
220                             HostCVec3&& ovlp,
221                             bool impsamp)
222   {
223     static_assert(std::decay<MatG>::type::dimensionality == 4, "Wrong dimensionality");
224     static_assert(std::decay<MatG_host>::type::dimensionality == 4, "Wrong dimensionality");
225     using ma::gemmBatched;
226     using std::copy_n;
227     using std::fill_n;
228     // assumes G[nwalk][spin][M][M]
229     int nw(G.size(0));
230     int npts(Orbitals.size(1));
231     assert(G.size(0) == wgt.size(0));
232     assert(wgt.size(0) == nw);
233     assert(Xw.size(0) == nw);
234     assert(ovlp.size(0) >= nw);
235     assert(G.num_elements() == G_host.num_elements());
236     assert(G.extensions() == G_host.extensions());
237 
238     int nsp;
239     if (walker_type == CLOSED)
240       nsp = 1;
241     else
242       nsp = 2;
243 
244     // check structure dimensions
245     if (iref == 0)
246     {
247       if (denom.size(0) != nw)
248       {
249         denom = mpi3CVector(iextensions<1u>{nw}, shared_allocator<ComplexType>{TG.TG_local()});
250       }
251       if (DMWork.size(0) != nw || DMWork.size(1) != 3 || DMWork.size(2) != dm_size)
252       {
253         DMWork = mpi3CTensor({nw, 3, dm_size}, shared_allocator<ComplexType>{TG.TG_local()});
254       }
255       if (Gr_host.size(0) != nw || Gr_host.size(1) != nsp || Gr_host.size(2) != npts || Gr_host.size(3) != npts)
256       {
257         Gr_host = mpi3C4Tensor({nw, nsp, npts, npts}, shared_allocator<ComplexType>{TG.TG_local()});
258       }
259       fill_n(denom.origin(), denom.num_elements(), ComplexType(0.0, 0.0));
260       fill_n(DMWork.origin(), DMWork.num_elements(), ComplexType(0.0, 0.0));
261     }
262     else
263     {
264       if (denom.size(0) != nw || DMWork.size(0) != nw || DMWork.size(1) != 3 || DMWork.size(2) != dm_size ||
265           Gr_host.size(0) != nw || Gr_host.size(1) != nsp || Gr_host.size(2) != npts || Gr_host.size(3) != npts ||
266           DMAverage.size(0) != nave || DMAverage.size(1) != 3 || DMAverage.size(2) != dm_size)
267         APP_ABORT(" Error: Invalid state in accumulate_reference. \n\n\n");
268     }
269 
270 
271     // calculate green functions in real space and send to host for processing/accumulation
272     // if memory becomes a problem, then batch over walkers
273     {
274       LocalTGBufferManager buffer_manager;
275       StaticMatrix T({nw * nsp * NMO, npts}, buffer_manager.get_generator().template get_allocator<ComplexType>());
276       StaticMatrix Gr({nsp * nw, npts * npts}, buffer_manager.get_generator().template get_allocator<ComplexType>());
277       CTensor_ref Gr3D(make_device_ptr(Gr.origin()), {nw, nsp, npts * npts});
278       CTensor_ref T3D(make_device_ptr(T.origin()), {nw, nsp, NMO * npts});
279       CMatrix_ref G2D(make_device_ptr(G.origin()), {nw * nsp * NMO, NMO});
280 
281       // T1[iw][ispin][i][r] = sum_j G[iw][ispin][i][j] * Psi(j,r)
282       int i0, iN;
283       std::tie(i0, iN) = FairDivideBoundary(TG.TG_local().rank(), int(G2D.size(0)), TG.TG_local().size());
284       ma::product(G2D.sliced(i0, iN), Orbitals, T.sliced(i0, iN));
285       TG.TG_local().barrier();
286 
287       // G[iw][ispin][r][r'] = sum_ij conj( Psi(i,r) ) * G[iw][ispin][i][j] * Psi(j,r')
288       //                     = sum_i conj( Psi(i,r) ) T1[iw][ispin][i][r'] = H(Psi) * T1
289       std::vector<decltype(ma::pointer_dispatch(Orbitals.origin()))> Aarray;
290       Aarray.reserve(nw * nsp);
291       std::vector<decltype(ma::pointer_dispatch(Orbitals.origin()))> Barray;
292       Barray.reserve(nw * nsp);
293       std::vector<decltype(ma::pointer_dispatch(Orbitals.origin()))> Carray;
294       Carray.reserve(nw * nsp);
295       for (int iw = 0, p = 0; iw < nw; ++iw)
296         for (int ispin = 0; ispin < nsp; ++ispin, ++p)
297           if (p % TG.TG_local().size() == TG.TG_local().rank())
298           {
299             Aarray.emplace_back(ma::pointer_dispatch(Orbitals.origin()));
300             Barray.emplace_back(ma::pointer_dispatch(T3D[iw][ispin].origin()));
301             Carray.emplace_back(ma::pointer_dispatch(Gr3D[iw][ispin].origin()));
302           }
303       // careful with fortran ordering
304       gemmBatched('N', 'C', npts, npts, NMO, ComplexType(1.0), Barray.data(), npts, Aarray.data(), npts,
305                   ComplexType(0.0), Carray.data(), npts, int(Aarray.size()));
306       TG.TG_local().barrier();
307       std::tie(i0, iN) = FairDivideBoundary(TG.TG_local().rank(), int(Gr.num_elements()), TG.TG_local().size());
308       copy_n(Gr.origin() + i0, (iN - i0), Gr_host.origin() + i0);
309     }
310     TG.TG_local().barrier();
311 
312     // : Z(type,r,r') = < (nup_r OP1 ndn_r) (nup_r' OP2 ndn_r') >
313     // where type= 0: charge-charge --> ++
314     //             1: spin-spin     --> --
315     //             2: charge-spin   --> +-
316     // THIS COULD BE WRITTEN WITH OPENMP TO ENABLE GPU EASILY!!!
317     for (int iw = 0; iw < nw; iw++)
318     {
319       if (TG.TG_local().root())
320         denom[iw] += Xw[iw];
321       if (iw % TG.TG_local().size() != TG.TG_local().rank())
322         continue;
323       if (walker_type == CLOSED)
324       {
325         auto&& Gur(Gr_host[iw][0]);
326         stdCMatrix_ref DMcc(to_address(DMWork[iw][0].origin()), {npts, npts});
327         stdCMatrix_ref DMss(to_address(DMWork[iw][1].origin()), {npts, npts});
328         stdCMatrix_ref DMcs(to_address(DMWork[iw][2].origin()), {npts, npts});
329         auto X_(2.0 * Xw[iw]);
330         for (int i = 0; i < npts; i++)
331           for (int j = 0; j < npts; j++)
332           {
333             auto g1(Gur[i][i] * Gur[j][j]);
334             auto g2(Gur[i][j] * Gur[j][i]);
335             auto o1(Gur[i][j] * Orrp[j][i]);
336             DMcc[i][j] += X_ * (2.0 * g1 - g2 + o1);
337             DMss[i][j] += X_ * (-g2 + o1);
338             // no cs in RHF
339           }
340       }
341       else if (walker_type == COLLINEAR)
342       {
343         auto&& Gur(Gr_host[iw][0]);
344         auto&& Gdr(Gr_host[iw][1]);
345         stdCMatrix_ref DMcc(to_address(DMWork[iw][0].origin()), {npts, npts});
346         stdCMatrix_ref DMss(to_address(DMWork[iw][1].origin()), {npts, npts});
347         stdCMatrix_ref DMcs(to_address(DMWork[iw][2].origin()), {npts, npts});
348         auto X_(Xw[iw]);
349         for (int i = 0; i < npts; i++)
350           for (int j = 0; j < npts; j++)
351           {
352             auto guu(Gur[i][i] * Gur[j][j] - Gur[i][j] * Gur[j][i] + Gur[i][j] * Orrp[j][i]);
353             auto gdd(Gdr[i][i] * Gdr[j][j] - Gdr[i][j] * Gdr[j][i] + Gdr[i][j] * Orrp[j][i]);
354             auto gud(Gur[i][i] * Gdr[j][j]);
355             auto gdu(Gdr[i][i] * Gur[j][j]);
356             DMcc[i][j] += X_ * (guu + gud + gdu + gdd);
357             DMss[i][j] += X_ * (guu - gud - gdu + gdd);
358             DMcs[i][j] += X_ * (guu - gud + gdu - gdd);
359           }
360       }
361       else if (walker_type == NONCOLLINEAR)
362       {
363         APP_ABORT(" Noncollinear not implemented \n\n\n");
364         auto&& Gur(Gr_host[iw][0]);
365         auto&& Gdr(Gr_host[iw][1]);
366         stdCMatrix_ref DMcc(to_address(DMWork[iw][0].origin()), {npts, npts});
367         stdCMatrix_ref DMss(to_address(DMWork[iw][1].origin()), {npts, npts});
368         stdCMatrix_ref DMcs(to_address(DMWork[iw][2].origin()), {npts, npts});
369         auto X_(Xw[iw]);
370         for (int i = 0; i < npts; i++)
371           for (int j = 0; j < npts; j++)
372           {
373             auto guu(Gur[i][i] * Gur[j][j] - Gur[i][j] * Gur[j][i]);
374             auto gdd(Gdr[i][i] * Gdr[j][j] - Gdr[i][j] * Gdr[j][i]);
375             auto gud(Gur[i][i] * Gdr[j][j] - Gur[i][j] * Gdr[j][i]);
376             auto gdu(Gdr[i][i] * Gur[j][j] - Gdr[i][j] * Gur[j][i]);
377             DMcc[i][j] += X_ * (guu + gud + gdu + gdd);
378             DMss[i][j] += X_ * (guu - gud - gdu + gdd);
379             DMcs[i][j] += X_ * (guu - gud + gdu - gdd);
380           }
381       }
382     }
383     TG.TG_local().barrier();
384   }
385 
386   template<class HostCVec>
accumulate_block(int iav,HostCVec && wgt,bool impsamp)387   void accumulate_block(int iav, HostCVec&& wgt, bool impsamp)
388   {
389     int nw(denom.size(0));
390     TG.TG_local().barrier();
391     // this is meant to be small, so serializing
392     if (TG.TG_local().root())
393     {
394       for (int iw = 0; iw < nw; iw++)
395         denom[iw] = wgt[iw] / denom[iw];
396       stdCVector_ref DMAv1D(to_address(DMAverage[iav].origin()), {3 * dm_size});
397       stdCMatrix_ref DMWork2D(to_address(DMWork.origin()), {nw, 3 * dm_size});
398       // DMAverage[iav][t][ij] += sum_iw DMWork[iw][t][ij] * denom[iw] = T( DMWork ) * denom
399       ma::product(ComplexType(1.0, 0.0), ma::T(DMWork2D), denom, ComplexType(1.0, 0.0), DMAv1D);
400     }
401     TG.TG_local().barrier();
402   }
403 
404   template<class HostCVec>
print(int iblock,hdf_archive & dump,HostCVec && Wsum)405   void print(int iblock, hdf_archive& dump, HostCVec&& Wsum)
406   {
407     using std::fill_n;
408     const int n_zero = 9;
409 
410     if (TG.TG_local().root())
411     {
412       ma::scal(ComplexType(1.0 / block_size), DMAverage);
413       TG.TG_heads().reduce_in_place_n(to_address(DMAverage.origin()), DMAverage.num_elements(), std::plus<>(), 0);
414       if (writer)
415       {
416         dump.push(std::string("OD2RDM"));
417         for (int t = 0; t < 3; ++t)
418         {
419           dump.push(type_id[t]);
420           for (int i = 0; i < nave; ++i)
421           {
422             dump.push(std::string("Average_") + std::to_string(i));
423             std::string padded_iblock =
424                 std::string(n_zero - std::to_string(iblock).length(), '0') + std::to_string(iblock);
425             stdCVector_ref DMAverage_(to_address(DMAverage[i][t].origin()), {dm_size});
426             dump.write(DMAverage_, "od2rdm_" + type_id[t] + padded_iblock);
427             dump.write(Wsum[i], "denominator_" + padded_iblock);
428             dump.pop();
429           }
430           dump.pop();
431         }
432         dump.pop();
433       }
434     }
435     TG.TG_local().barrier();
436     fill_n(DMAverage.origin(), DMAverage.num_elements(), ComplexType(0.0, 0.0));
437   }
438 
439 private:
440   Allocator alloc;
441 
442   int block_size;
443 
444   int nave;
445 
446   int counter;
447 
448   TaskGroup_& TG;
449 
450   WALKER_TYPES walker_type;
451 
452   int dm_size;
453 
454   bool writer;
455 
456   std::vector<std::string> type_id;
457 
458   // unfolding of orbitals to super cell (in KP case) is assumed done
459   // as a preprocessing step
460   CMatrix Orbitals;
461 
462   // type:0-cc, 1-ss, 2-cs (c=charge, s=spin)
463   // np = # of points in real space.
464   // DMAverage (nave, type, np*np )
465   mpi3CTensor DMAverage;
466   // DMWork (nwalk, type, np*np )
467   mpi3CTensor DMWork;
468 
469   mpi3CVector denom;
470 
471   mpi3C4Tensor Gr_host;
472   mpi3CMatrix Orrp;
473 };
474 
475 } // namespace afqmc
476 } // namespace qmcplusplus
477 
478 #endif
479