//////////////////////////////////////////////////////////////////////
// This file is distributed under the University of Illinois/NCSA Open Source
// License.  See LICENSE file in top directory for details.
//
// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
//
// File developed by:
// Miguel A. Morales, moralessilva2@llnl.gov
//    Lawrence Livermore National Laboratory
//
// File created by:
// Miguel A. Morales, moralessilva2@llnl.gov
//    Lawrence Livermore National Laboratory
////////////////////////////////////////////////////////////////////////////////

#ifndef QMCPLUSPLUS_AFQMC_FULL2RDM_HPP
#define QMCPLUSPLUS_AFQMC_FULL2RDM_HPP

#include "AFQMC/config.h"
#include <vector>
#include <string>
#include <iostream>

#include "hdf/hdf_multi.h"
#include "hdf/hdf_archive.h"
#include "OhmmsData/libxmldefs.h"
#include "Utilities/Timer.h"

#include "AFQMC/Walkers/WalkerSet.hpp"
#include "AFQMC/Numerics/ma_operations.hpp"
#include "AFQMC/Memory/buffer_managers.h"

namespace qmcplusplus
{
namespace afqmc
{
/*
 * Observable class that calculates the walker-averaged full 2-RDM.
 * The resulting RDM is stored as [nspinblocks][i][k][j][l], where each orbital index
 * runs over x*NMO states, with x = 2 for NONCOLLINEAR and 1 for everything else.
 * For COLLINEAR walkers, the spin-block ordering is (a,a,a,a), (a,a,b,b), (b,b,b,b).
 */
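/*
 * For illustration only (derived from acc_no_rotation below, assuming no rotation
 * and x = 1): an element (i,k,j,l) of spin block s sits at the flattened offset
 *   idx = s*NMO^4 + ((i*NMO + k)*NMO + j)*NMO + l
 * inside DMWork[iw] and DMAverage[iav].
 */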
class full2rdm : public AFQMCInfo
{
  // allocators
  using Allocator        = device_allocator<ComplexType>;
  using Allocator_shared = node_allocator<ComplexType>;

  // type defs
  using pointer              = typename Allocator::pointer;
  using const_pointer        = typename Allocator::const_pointer;
  using pointer_shared       = typename Allocator_shared::pointer;
  using const_pointer_shared = typename Allocator_shared::const_pointer;

  using CVector_ref    = boost::multi::array_ref<ComplexType, 1, pointer>;
  using CMatrix_ref    = boost::multi::array_ref<ComplexType, 2, pointer>;
  using CVector        = boost::multi::array<ComplexType, 1, Allocator>;
  using CMatrix        = boost::multi::array<ComplexType, 2, Allocator>;
  using sharedCMatrix  = boost::multi::array<ComplexType, 2, Allocator_shared>;
  using stdCVector_ref = boost::multi::array_ref<ComplexType, 1>;
  using stdCMatrix_ref = boost::multi::array_ref<ComplexType, 2>;
  using stdCVector     = boost::multi::array<ComplexType, 1>;
  using stdCMatrix     = boost::multi::array<ComplexType, 2>;
  using stdIMatrix     = boost::multi::array<int, 2>;
  using mpi3CVector    = boost::multi::array<ComplexType, 1, shared_allocator<ComplexType>>;
  using mpi3IMatrix    = boost::multi::array<int, 2, shared_allocator<int>>;
  using mpi3CMatrix    = boost::multi::array<ComplexType, 2, shared_allocator<ComplexType>>;
  using mpi3CTensor    = boost::multi::array<ComplexType, 3, shared_allocator<ComplexType>>;
  using mpi3C4Tensor   = boost::multi::array<ComplexType, 4, shared_allocator<ComplexType>>;

  using stack_alloc_type = DeviceBufferManager::template allocator_t<ComplexType>;
  using StaticVector     = boost::multi::static_array<ComplexType, 1, stack_alloc_type>;
  using StaticMatrix     = boost::multi::static_array<ComplexType, 2, stack_alloc_type>;
public:
  full2rdm(afqmc::TaskGroup_& tg_, AFQMCInfo& info, xmlNodePtr cur, WALKER_TYPES wlk, int nave_ = 1, int bsize = 1)
      : AFQMCInfo(info),
        TG(tg_),
        walker_type(wlk),
        writer(false),
        block_size(bsize),
        nave(nave_),
        counter(0),
        apply_rotation(false),
        XRot({0, 0}, make_node_allocator<ComplexType>(TG)),
        denom(iextensions<1u>{0}, shared_allocator<ComplexType>{TG.TG_local()}),
        DMAverage({0, 0}, shared_allocator<ComplexType>{TG.TG_local()}),
        DMWork({0, 0}, shared_allocator<ComplexType>{TG.TG_local()})
  {
    using std::copy_n;
    using std::fill_n;

    app_log() << "  --  Adding Back Propagated 2RDM (TwoRDM) estimator. -- \n";

    std::string rot_file("");
    std::string path("/");
    std::string str("false");

    if (cur != NULL)
    {
      ParameterSet m_param;
      m_param.add(rot_file, "rotation");
      m_param.add(path, "path");
      m_param.put(cur);
    }
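    // Hypothetical input snippet for the ParameterSet keys read above ("rot_matrix.h5"
    // is a placeholder; the enclosing estimator tags depend on the AFQMC driver input):
    //   <parameter name="rotation">rot_matrix.h5</parameter>
    //   <parameter name="path">/</parameter>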
106 
107     if (rot_file != "")
108     {
109       if (not file_exists(rot_file))
110       {
111         app_error() << " Error: File with rotation matrix does not exist: " << rot_file << std::endl;
112         APP_ABORT("");
113       }
114       apply_rotation = true;
115       int dim[2];
116 
117       hdf_archive dump;
118       if (TG.Node().root())
119       {
120         if (!dump.open(rot_file, H5F_ACC_RDONLY))
121           APP_ABORT("Error opening orbitals file for n2r estimator.\n");
122         if (!dump.push(path, false))
123           APP_ABORT("Error in full2rdm: path not found.");
124         stdCMatrix R;
125         if (!dump.readEntry(R, "RotationMatrix"))
126           APP_ABORT("Error reading RotationMatrix.\n");
127         if (R.size(1) != NMO)
128           APP_ABORT("Error Wrong dimensions in RotationMatrix.\n");
129         dim[0] = R.size(0);
130         dim[1] = 0;
131         // conjugate rotation matrix
132         std::transform(R.origin(), R.origin() + R.num_elements(), R.origin(),
__anon815ea36c0102(const auto& c) 133                        [](const auto& c) { return std::conj(c); });
134         TG.Node().broadcast_n(dim, 2, 0);
135         XRot = sharedCMatrix({dim[0], NMO}, make_node_allocator<ComplexType>(TG));
136         copy_n(R.origin(), R.num_elements(), make_device_ptr(XRot.origin()));
137         if (TG.Node().root())
138           TG.Cores().broadcast_n(to_address(XRot.origin()), XRot.num_elements(), 0);
139 
140         dump.pop();
141         dump.close();
142       }
143       else
144       {
145         TG.Node().broadcast_n(dim, 2, 0);
146         XRot = sharedCMatrix({dim[0], NMO}, make_node_allocator<ComplexType>(TG));
147         if (TG.Node().root())
148           TG.Cores().broadcast_n(to_address(XRot.origin()), XRot.num_elements(), 0);
149       }
150       TG.Node().barrier();
151 
152       dm_size = XRot.size(0) * XRot.size(0) * XRot.size(0) * XRot.size(0);
153     }
154     else
155     {
156       dm_size = NMO * NMO * NMO * NMO;
157     }
158 
159     // (a,a,a,a), (a,a,b,b)
160     nspinblocks = 2;
161     if (walker_type == COLLINEAR)
162     {
163       nspinblocks = 3; // (a,a,a,a), (a,a,b,b), (b,b,b,b)
164     }
165     else if (walker_type == NONCOLLINEAR)
166       APP_ABORT(" Error: NONCOLLINEAR not yet implemented. \n\n\n");
167 
168     dm_size *= nspinblocks;
169 
170     using std::fill_n;
171     writer = (TG.getGlobalRank() == 0);
172 
173     DMAverage = mpi3CMatrix({nave, dm_size}, shared_allocator<ComplexType>{TG.TG_local()});
174     fill_n(DMAverage.origin(), DMAverage.num_elements(), ComplexType(0.0, 0.0));
175   }
176 
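  // Accumulates one back-propagated reference into the per-walker buffers:
  // G is the device Green's function (assumed [nwalk][spin][NMO][NMO] here), G_host a
  // host copy, and Xw the per-walker factor added to denom and used to weight the
  // accumulation into DMWork. wgt and ovlp are only sanity-checked at this stage.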
  template<class MatG, class MatG_host, class HostCVec1, class HostCVec2, class HostCVec3>
  void accumulate_reference(int iav,
                            int iref,
                            MatG&& G,
                            MatG_host&& G_host,
                            HostCVec1&& wgt,
                            HostCVec2&& Xw,
                            HostCVec3&& ovlp,
                            bool impsamp)
  {
    static_assert(std::decay<MatG>::type::dimensionality == 4, "Wrong dimensionality");
    static_assert(std::decay<MatG_host>::type::dimensionality == 4, "Wrong dimensionality");
    using std::fill_n;
    // assumes G[nwalk][spin][M][M]
    int nw(G.size(0));
    assert(G.size(0) == wgt.size(0));
    assert(wgt.size(0) == nw);
    assert(Xw.size(0) == nw);
    assert(ovlp.size(0) >= nw);
    assert(G.num_elements() == G_host.num_elements());
    assert(G.extensions() == G_host.extensions());

    // check structure dimensions
    if (iref == 0)
    {
      if (denom.size(0) != nw)
      {
        denom = mpi3CVector(iextensions<1u>{nw}, shared_allocator<ComplexType>{TG.TG_local()});
      }
      if (DMWork.size(0) != nw || DMWork.size(1) != dm_size)
      {
        DMWork = mpi3CMatrix({nw, dm_size}, shared_allocator<ComplexType>{TG.TG_local()});
      }
      fill_n(denom.origin(), denom.num_elements(), ComplexType(0.0, 0.0));
      fill_n(DMWork.origin(), DMWork.num_elements(), ComplexType(0.0, 0.0));
    }
    else
    {
      if (denom.size(0) != nw || DMWork.size(0) != nw || DMWork.size(1) != dm_size || DMAverage.size(0) != nave ||
          DMAverage.size(1) != dm_size)
        APP_ABORT(" Error: Invalid state in accumulate_reference. \n\n\n");
    }

    if (apply_rotation)
      acc_with_rotation(G, Xw);
    else
      acc_no_rotation(G, Xw);
  }

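  // Folds the per-walker accumulators into the running block average:
  // denom[iw] is replaced by wgt[iw] / denom[iw] (the accumulated Xw sum), and
  // DMAverage[iav] += T(DMWork) * denom over this task group's slice of dm_size.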
  template<class HostCVec>
  void accumulate_block(int iav, HostCVec&& wgt, bool impsamp)
  {
    int nw(denom.size(0));
    int i0, iN;
    std::tie(i0, iN) = FairDivideBoundary(TG.TG_local().rank(), dm_size, TG.TG_local().size());

    if (TG.TG_local().root())
      for (int iw = 0; iw < nw; iw++)
        denom[iw] = wgt[iw] / denom[iw];
    TG.TG_local().barrier();

    // DMAverage[iav][ij] += sum_iw DMWork[iw][ij] * denom[iw] = T( DMWork ) * denom
    ma::product(ComplexType(1.0, 0.0), ma::T(DMWork({0, nw}, {i0, iN})), denom, ComplexType(1.0, 0.0),
                DMAverage[iav].sliced(i0, iN));
    TG.TG_local().barrier();
  }

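  // Writes the block-averaged 2-RDM to HDF5. The push/write calls below produce,
  // relative to the group dump is currently positioned at:
  //   FullTwoRDM/Average_<iav>/two_rdm_<zero-padded block index>
  //   FullTwoRDM/Average_<iav>/denominator_<zero-padded block index>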
  template<class HostCVec>
  void print(int iblock, hdf_archive& dump, HostCVec&& Wsum)
  {
    using std::fill_n;
    const int n_zero = 9;

    if (TG.TG_local().root())
    {
      ma::scal(ComplexType(1.0 / block_size), DMAverage);
      TG.TG_heads().reduce_in_place_n(to_address(DMAverage.origin()), DMAverage.num_elements(), std::plus<>(), 0);
      if (writer)
      {
        dump.push(std::string("FullTwoRDM"));
        for (int i = 0; i < nave; ++i)
        {
          dump.push(std::string("Average_") + std::to_string(i));
          std::string padded_iblock =
              std::string(n_zero - std::to_string(iblock).length(), '0') + std::to_string(iblock);
          stdCVector_ref DMAverage_(to_address(DMAverage[i].origin()), {dm_size});
          dump.write(DMAverage_, "two_rdm_" + padded_iblock);
          dump.write(Wsum[i], "denominator_" + padded_iblock);
          dump.pop();
        }
        dump.pop();
      }
    }
    TG.TG_local().barrier();
    fill_n(DMAverage.origin(), DMAverage.num_elements(), ComplexType(0.0, 0.0));
  }

private:
  TaskGroup_& TG;

  WALKER_TYPES walker_type;

  bool writer;

  int block_size;

  int nave;

  int counter;

  int nspinblocks;

  int dm_size;

  bool apply_rotation;

  sharedCMatrix XRot;
  stdCVector Grot;

  mpi3CVector denom;

  // DMAverage (nave, nspinblocks*(x*NMO)^4), x=(1:CLOSED/COLLINEAR, 2:NONCOLLINEAR)
  mpi3CMatrix DMAverage;
  // DMWork (nwalk, nspinblocks*(x*NMO)^4), x=(1:CLOSED/COLLINEAR, 2:NONCOLLINEAR)
  mpi3CMatrix DMWork;
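  // Here dm_size = nspinblocks * (x*NMO)^4 (or nspinblocks * XRot.size(0)^4 when a
  // rotation matrix is supplied), matching the flattening used in acc_no_rotation.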

  // expects host accessible memory
  template<class MatG, class CVec>
  void acc_no_rotation(MatG&& G, CVec&& Xw)
  {
    // doing this 1 walker at a time and not worrying about speed
    int nw(G.size(0));

    int i0, iN;
    std::tie(i0, iN) = FairDivideBoundary(TG.TG_local().rank(), NMO * NMO, TG.TG_local().size());
    int dN           = iN - i0;

    size_t M2(NMO * NMO);
    size_t M4(M2 * M2);
    size_t N = size_t(dN) * M2;
    DeviceBufferManager buffer_manager;
    StaticMatrix R({dN, NMO * NMO}, buffer_manager.get_generator().template get_allocator<ComplexType>());
    CMatrix_ref Q(R.origin(), {NMO * NMO, NMO});
    CVector_ref R1D(R.origin(), R.num_elements());
    CVector_ref Q1D(Q.origin(), Q.num_elements());

    // put this in shared memory!!!
    StaticMatrix Gt({NMO, NMO}, buffer_manager.get_generator().template get_allocator<ComplexType>());
    CMatrix_ref GtC(Gt.origin(), {NMO * NMO, 1});
#if defined(ENABLE_CUDA) || defined(ENABLE_HIP)
    if (Grot.size() < R.num_elements())
      Grot = stdCVector(iextensions<1u>(R.num_elements()));
#endif

    if (TG.TG_local().size() > 1)
      APP_ABORT(" ncores > 1 not yet allowed in full2rdm.\n\n\n");

    // (ikjl) = Gik * Gjl - (same spin) Gil Gjk
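    // Direct term: R[(i,k)][(j,l)] = G[i][k]*G[j][l] for rows (i,k) in [i0,iN), accumulated
    // into the matching (i,k,j,l) slice of the current spin block of DMWork below.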
    if (walker_type == COLLINEAR)
    {
      for (int iw = 0; iw < nw; iw++)
      {
        if (TG.TG_local().root())
          denom[iw] += Xw[iw];

        CMatrix_ref Gup(make_device_ptr(G[iw][0].origin()), {NMO * NMO, 1});
        CMatrix_ref Gdn(make_device_ptr(G[iw][1].origin()), {NMO * NMO, 1});
        // use ger !!!!

        //  (a,a,a,a)
        ma::product(Gup.sliced(i0, iN), ma::T(Gup), R);
#if defined(ENABLE_CUDA) || defined(ENABLE_HIP)
        using std::copy_n;
        copy_n(R.origin(), R.num_elements(), Grot.origin());
        ma::axpy(Xw[iw], Grot, DMWork[iw].sliced(size_t(i0) * M2, size_t(iN) * M2));
#else
        ma::axpy(Xw[iw], R1D, DMWork[iw].sliced(size_t(i0) * M2, size_t(iN) * M2));
#endif
        TG.TG_local().barrier();
        ma::transpose(G[iw][0], Gt);
        // EXX: (ikjl) -= Gil Gjk = Gt_kj  x G[i]l  for each i
        // parallelize this!!!
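        // In the loop below, Q[(k*NMO + j)][l] = -Gt[k][j] * G[iw][0][i][l] = -G_jk * G_il,
        // which lands at the flattened (i,k,j,l) positions of the same spin block.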
        for (int i = 0; i < NMO; ++i)
        {
          ma::product(ComplexType(-1.0), GtC, G[iw][0].sliced(i, i + 1), ComplexType(0.0), Q);
#if defined(ENABLE_CUDA) || defined(ENABLE_HIP)
          using std::copy_n;
          copy_n(Q.origin(), Q.num_elements(), Grot.origin());
          ma::axpy(Xw[iw], Grot.sliced(0, Q.num_elements()),
                   DMWork[iw].sliced(size_t(i * NMO) * M2, size_t((i + 1) * NMO) * M2));
#else
          ma::axpy(Xw[iw], Q1D, DMWork[iw].sliced(size_t(i * NMO) * M2, size_t((i + 1) * NMO) * M2));
#endif
        }

        //  (a,a,b,b)
        ma::product(Gup.sliced(i0, iN), ma::T(Gdn), R);
#if defined(ENABLE_CUDA) || defined(ENABLE_HIP)
        using std::copy_n;
        copy_n(R.origin(), R.num_elements(), Grot.origin());
        ma::axpy(Xw[iw], Grot, DMWork[iw].sliced(M4 + size_t(i0) * M2, M4 + size_t(iN) * M2));
#else
        ma::axpy(Xw[iw], R1D, DMWork[iw].sliced(M4 + size_t(i0) * M2, M4 + size_t(iN) * M2));
#endif

        //  (b,b,b,b)
        ma::product(Gdn.sliced(i0, iN), ma::T(Gdn), R);
#if defined(ENABLE_CUDA) || defined(ENABLE_HIP)
        using std::copy_n;
        copy_n(R.origin(), R.num_elements(), Grot.origin());
        ma::axpy(Xw[iw], Grot, DMWork[iw].sliced(2 * M4 + size_t(i0) * M2, 2 * M4 + size_t(iN) * M2));
#else
        ma::axpy(Xw[iw], R1D, DMWork[iw].sliced(2 * M4 + size_t(i0) * M2, 2 * M4 + size_t(iN) * M2));
#endif
        TG.TG_local().barrier();
        ma::transpose(G[iw][1], Gt);
        // EXX: (ikjl) -= Gil Gjk = Gt_kj  x G[i]l  for each i
        // parallelize this!!!
        for (int i = 0; i < NMO; ++i)
        {
          ma::product(ComplexType(-1.0), GtC, G[iw][1].sliced(i, i + 1), ComplexType(0.0), Q);
#if defined(ENABLE_CUDA) || defined(ENABLE_HIP)
          using std::copy_n;
          copy_n(Q.origin(), Q.num_elements(), Grot.origin());
          ma::axpy(Xw[iw], Grot.sliced(0, Q.num_elements()),
                   DMWork[iw].sliced(2 * M4 + size_t(i * NMO) * M2, 2 * M4 + size_t((i + 1) * NMO) * M2));
#else
          ma::axpy(Xw[iw], Q1D, DMWork[iw].sliced(2 * M4 + size_t(i * NMO) * M2, 2 * M4 + size_t((i + 1) * NMO) * M2));
#endif
        }
      }
    }
    else
    {
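      // Non-COLLINEAR (e.g. CLOSED) path: currently disabled; the commented-out code
      // below is kept for reference, so this branch accumulates nothing.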
      /*
      int N = dN*NMO*NMO;
      set_buffer(N);
      CMatrix_ref R( Buff.origin(), {dN,NMO*NMO});
      CVector_ref R1D( Buff.origin(), {dN*NMO*NMO});
#if defined(ENABLE_CUDA)
      if(Grot.size() < R.num_elements())
        Grot = stdCVector(iextensions<1u>(R.num_elements()));
#endif


      for(int iw=0; iw<nw; iw++) {

        if(TG.TG_local().root()) denom[iw] += Xw[iw];

        CMatrix_ref Gw( to_address(G[iw].origin()), {G[0].num_elements(),1});
        // use ger later
        ma::product( Gw.sliced(i0,iN), ma::T(Gw), R );

#if defined(ENABLE_CUDA)
        using std::copy_n;
        copy_n(R.origin(),R.num_elements(),Grot.origin());
        ma::axpy( Xw[iw], Grot, DMWork[iw].sliced(i0*NMO*NMO,iN*NMO*NMO) );
#else
        ma::axpy( Xw[iw], R.sliced(i0,iN), DMWork[iw].sliced(i0,iN) );
#endif
      }
*/
    }
    TG.TG_local().barrier();
  }

  // G should be device accessible memory
  // Xw is in host memory
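  // NOTE: the body below is currently commented out, so with apply_rotation=true the
  // accumulation is a no-op (only the rotation matrix is read in the constructor).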
  template<class MatG, class CVec>
  void acc_with_rotation(MatG&& G, CVec&& Xw)
  {
    /*
    int i0, iN;
    std::tie(i0,iN) = FairDivideBoundary(TG.TG_local().rank(),int(XRot.size(0)),TG.TG_local().size());

    // can batch in the future if too slow
    // Grot = Xc * G * H(Xc)
    int nX = XRot.size(0);
    int sz = nX * (NMO + (iN-i0));
    int npts = (iN-i0)*nX;
    set_buffer(sz);
    CMatrix_ref T1(Buff.origin(),{(iN-i0),NMO});
    CMatrix_ref T2(T1.origin()+T1.num_elements(),{(iN-i0),nX});
    if(Grot.size() != npts)
      Grot = stdCVector(iextensions<1u>(npts));

    // round-robin for now
    int cnt=0;
    for(int iw=0; iw<nw; iw++) {
      if(i0==iN || i0==XRot.size(0)) break;
      if(TG.TG_local().root()) denom[iw] += Xw[iw];
      ma::product(XRot.sliced(i0,iN),G[iw][0],T1);
      ma::product(T1,ma::H(XRot),T2);
      copy_n(T2.origin(),T2.num_elements(),Grot.origin());
      ma::axpy( Xw[iw], Grot, DMWork[iw].sliced(i0*nX,i0*nX+npts) );
      if(walker_type == COLLINEAR) {
        ma::product(XRot.sliced(i0,iN),G[iw][1],T1);
        ma::product(T1,ma::H(XRot),T2);
        copy_n(T2.origin(),T2.num_elements(),Grot.origin());
        ma::axpy( Xw[iw], Grot, DMWork[iw].sliced((nX+i0)*nX,(nX+i0)*nX+npts) );
      }
    }
    TG.TG_local().barrier();
*/
  }
};

} // namespace afqmc
} // namespace qmcplusplus

#endif