1 //////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source
3 // License.  See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
6 //
7 // File developed by:
8 // Miguel A. Morales, moralessilva2@llnl.gov
9 //    Lawrence Livermore National Laboratory
10 //
11 // File created by:
12 // Miguel A. Morales, moralessilva2@llnl.gov
13 //    Lawrence Livermore National Laboratory
14 ////////////////////////////////////////////////////////////////////////////////
15 
16 #ifndef QMCPLUSPLUS_AFQMC_N2R_HPP
17 #define QMCPLUSPLUS_AFQMC_N2R_HPP
18 
19 #include "AFQMC/config.h"
20 #include <vector>
21 #include <string>
22 #include <iostream>
23 
24 #include "hdf/hdf_multi.h"
25 #include "hdf/hdf_archive.h"
26 #include "OhmmsData/libxmldefs.h"
27 #include "Utilities/Timer.h"
28 
29 #include "AFQMC/Walkers/WalkerSet.hpp"
30 #include "AFQMC/Numerics/detail/utilities.hpp"
31 #include "AFQMC/Numerics/ma_operations.hpp"
32 #include "AFQMC/Numerics/batched_operations.hpp"
33 
34 namespace qmcplusplus
35 {
36 namespace afqmc
37 {
38 /*
39  * Observable class that calculates the walker averaged on-top pair density
40  * Alloc defines the allocator type used to store the orbital and temporary tensors.
41  * In a device compilation, this would need out-of-card gemm available.
42  */
43 template<class Alloc>
44 class n2r : public AFQMCInfo
45 {
46   // allocators
47   using Allocator = device_allocator<ComplexType>;
48 
49   // type defs
50   using pointer       = typename Allocator::pointer;
51   using const_pointer = typename Allocator::const_pointer;
52 
53   // allocators
54   using aux_Allocator = Alloc;
55 
56   using aux_pointer       = typename aux_Allocator::pointer;
57   using const_aux_pointer = typename aux_Allocator::const_pointer;
58 
59   using auxCVector     = boost::multi::array<ComplexType, 1, aux_Allocator>;
60   using auxCMatrix     = boost::multi::array<ComplexType, 2, aux_Allocator>;
61   using auxCVector_ref = boost::multi::array_ref<ComplexType, 1, aux_pointer>;
62   using auxCMatrix_ref = boost::multi::array_ref<ComplexType, 2, aux_pointer>;
63 
64   using CVector_ref    = boost::multi::array_ref<ComplexType, 1, pointer>;
65   using CMatrix_ref    = boost::multi::array_ref<ComplexType, 2, pointer>;
66   using CVector        = boost::multi::array<ComplexType, 1, Allocator>;
67   using CMatrix        = boost::multi::array<ComplexType, 2, Allocator>;
68   using stdCVector     = boost::multi::array<ComplexType, 1>;
69   using stdIVector     = boost::multi::array<int, 1>;
70   using stdCVector_ref = boost::multi::array_ref<ComplexType, 1>;
71   using stdCMatrix_ref = boost::multi::array_ref<ComplexType, 2>;
72   using mpi3CVector    = boost::multi::array<ComplexType, 1, shared_allocator<ComplexType>>;
73   using mpi3CMatrix    = boost::multi::array<ComplexType, 2, shared_allocator<ComplexType>>;
74   using mpi3CTensor    = boost::multi::array<ComplexType, 3, shared_allocator<ComplexType>>;
75   using mpi3C4Tensor   = boost::multi::array<ComplexType, 4, shared_allocator<ComplexType>>;
76 
77 public:
n2r(afqmc::TaskGroup_ & tg_,AFQMCInfo & info,xmlNodePtr cur,WALKER_TYPES wlk,bool host_mem,aux_Allocator alloc_,aux_Allocator orb_alloc_,int nave_=1,int bsize=1)78   n2r(afqmc::TaskGroup_& tg_,
79       AFQMCInfo& info,
80       xmlNodePtr cur,
81       WALKER_TYPES wlk,
82       bool host_mem,
83       aux_Allocator alloc_,
84       aux_Allocator orb_alloc_,
85       int nave_ = 1,
86       int bsize = 1)
87       : AFQMCInfo(info),
88         aux_alloc(alloc_),
89         block_size(bsize),
90         nave(nave_),
91         counter(0),
92         TG(tg_),
93         walker_type(wlk),
94         dm_size(0),
95         writer(false),
96         use_host_memory(host_mem),
97         hdf_walker_output(""),
98         Orbitals({0, 0}, orb_alloc_),
99         DMAverage({0, 0}, shared_allocator<ComplexType>{TG.TG_local()}),
100         DMWork({0, 0}, shared_allocator<ComplexType>{TG.TG_local()}),
101         denom(iextensions<1u>{0}, shared_allocator<ComplexType>{TG.TG_local()}),
102         Buff(iextensions<1u>{0}, alloc_),
103         Buff2(iextensions<1u>{0})
104   {
105     app_log() << "  --  Adding Back Propagated on-top pair density (N2R) estimator. -- \n ";
106     std::string orb_file("");
107     std::string str("false");
108 
109     if (cur != NULL)
110     {
111       ParameterSet m_param;
112       m_param.add(orb_file, "orbitals");
113       m_param.put(cur);
114     }
115 
116     if (not file_exists(orb_file))
117     {
118       app_error() << " Error: orbitals file does not exist: " << orb_file << std::endl;
119       APP_ABORT("");
120     }
121 
122     stdIVector norbs(iextensions<1u>{0});
123     stdIVector grid_dim(iextensions<1u>{3});
124     dm_size = 0;
125 
126     // read orbitals
127     hdf_archive dump;
128     if (TG.Node().root())
129     {
130       if (!dump.open(orb_file, H5F_ACC_RDONLY))
131       {
132         app_error() << " Error opening orbitals file for n2r estimator. \n";
133         APP_ABORT("");
134       }
135       if (!dump.push("OrbsR", false))
136       {
137         app_error() << " Error in n2r: Group OrbsR not found." << std::endl;
138         APP_ABORT("");
139       }
140       if (!dump.readEntry(grid_dim, "grid_dim"))
141       {
142         app_error() << " Error in n2r: Problems reading grid_dim. " << std::endl;
143         APP_ABORT("");
144       }
145       assert(grid_dim.size() == 3);
146       dm_size = grid_dim[0] * grid_dim[1] * grid_dim[2];
147       if (dm_size < 1)
148       {
149         app_error() << " Error in n2r: prod(grid_dim) < 1. " << std::endl;
150         APP_ABORT("");
151       }
152       TG.Node().broadcast_n(&dm_size, 1, 0);
153       if (!dump.readEntry(norbs, "number_of_orbitals"))
154       {
155         app_error() << " Error in n2r: Problems reading number_of_orbitals. " << std::endl;
156         APP_ABORT("");
157       }
158       int M = 0, nk = norbs.size();
159       for (int i = 0; i < nk; i++)
160         M += norbs[i];
161       if (M != NMO)
162       {
163         app_error() << " Error in n2r: Inconsistent number of orbitals in file: " << M << " expected:" << NMO
164                     << std::endl;
165         APP_ABORT("");
166       }
167       Orbitals = auxCMatrix({NMO, dm_size}, orb_alloc_);
168       for (int k = 0, kn = 0; k < nk; k++)
169       {
170         for (int i = 0; i < norbs[k]; i++, kn++)
171         {
172           stdCVector orb(iextensions<1u>{dm_size});
173           if (!dump.readEntry(orb, "kp" + std::to_string(k) + "_b" + std::to_string(i)))
174           {
175             app_error() << " Error in n2r: Problems reading orbital: " << k << " " << i << std::endl;
176             APP_ABORT("");
177           }
178           using std::copy_n;
179           copy_n(orb.origin(), dm_size, ma::pointer_dispatch(Orbitals[kn].origin()));
180         }
181       }
182       dump.pop();
183       dump.close();
184 
185       app_log() << " Number of grid points: " << dm_size << std::endl;
186     }
187     else
188     {
189       TG.Node().broadcast_n(&dm_size, 1, 0);
190       Orbitals = auxCMatrix({NMO, dm_size}, orb_alloc_);
191     }
192     TG.Node().barrier();
193 
194     using std::fill_n;
195     writer = (TG.getGlobalRank() == 0);
196 
197     DMAverage = mpi3CMatrix({nave, dm_size}, shared_allocator<ComplexType>{TG.TG_local()});
198     fill_n(DMAverage.origin(), DMAverage.num_elements(), ComplexType(0.0, 0.0));
199   }
200 
201   template<class MatG, class MatG_host, class HostCVec1, class HostCVec2, class HostCVec3>
accumulate_reference(int iav,int iref,MatG && G,MatG_host && G_host,HostCVec1 && wgt,HostCVec2 && Xw,HostCVec3 && ovlp,bool impsamp)202   void accumulate_reference(int iav,
203                             int iref,
204                             MatG&& G,
205                             MatG_host&& G_host,
206                             HostCVec1&& wgt,
207                             HostCVec2&& Xw,
208                             HostCVec3&& ovlp,
209                             bool impsamp)
210   {
211     static_assert(std::decay<MatG>::type::dimensionality == 4, "Wrong dimensionality");
212     static_assert(std::decay<MatG_host>::type::dimensionality == 4, "Wrong dimensionality");
213     using std::copy_n;
214     using std::fill_n;
215     // assumes G[nwalk][spin][M][M]
216     int nw(G.size(0));
217     assert(G.size(0) == wgt.size(0));
218     assert(wgt.size(0) == nw);
219     assert(Xw.size(0) == nw);
220     assert(ovlp.size(0) >= nw);
221     assert(G.num_elements() == G_host.num_elements());
222     assert(G.extensions() == G_host.extensions());
223 
224     int nsp;
225     if (walker_type == CLOSED)
226       nsp = 1;
227     else
228       nsp = 2;
229 
230     // check structure dimensions
231     if (iref == 0)
232     {
233       if (denom.size(0) != nw)
234       {
235         denom = mpi3CVector(iextensions<1u>{nw}, shared_allocator<ComplexType>{TG.TG_local()});
236       }
237       if (DMWork.size(0) != nw || DMWork.size(1) != dm_size)
238       {
239         DMWork = mpi3CMatrix({nw, dm_size}, shared_allocator<ComplexType>{TG.TG_local()});
240       }
241       fill_n(denom.origin(), denom.num_elements(), ComplexType(0.0, 0.0));
242       fill_n(DMWork.origin(), DMWork.num_elements(), ComplexType(0.0, 0.0));
243     }
244     else
245     {
246       if (denom.size(0) != nw || DMWork.size(0) != nw || DMWork.size(1) != dm_size || DMAverage.size(0) != nave ||
247           DMAverage.size(1) != dm_size)
248         APP_ABORT(" Error: Invalid state in accumulate_reference. \n\n\n");
249     }
250 
251     int i0, iN;
252     std::tie(i0, iN) = FairDivideBoundary(TG.TG_local().rank(), dm_size, TG.TG_local().size());
253 
254     // MAM: Uses out-of-card GEMM on device!!!
255     // T1[i][r] = sum_j G[spin][i][j] * Psi(j,r)
256     // G[spin][r] = sum_ij conj( Psi(i,r) ) * G[spin][i][j] * Psi(j,r) = sum_i conj( Psi(i,r) ) T1[i][r]
257     int N = dm_size * NMO + dm_size;
258     set_buffer(N);
259     auxCMatrix_ref T(Buff.origin(), {NMO, dm_size});
260     auxCVector_ref Gr(Buff.origin() + T.num_elements(), iextensions<1u>{dm_size});
261 
262     int N2 = nsp * (iN - i0);
263     set_buffer2(N2);
264     stdCMatrix_ref Gr_(Buff2.origin(), {nsp, (iN - i0)});
265 
266     // change batched_dot to a ** interface to make it more general and useful
267 
268     for (int iw = 0; iw < nw; iw++)
269     {
270       if (TG.TG_local().root())
271         denom[iw] += Xw[iw];
272       auto&& Gu = G[iw][0];
273       auto&& Orb0N(Orbitals(Orbitals.extension(0), {i0, iN}));
274       auto&& T0N(T(T.extension(0), {i0, iN}));
275       ma::product(Gu, Orb0N, T0N);
276       using ma::batched_dot;
277       batched_dot('H', 'T', (iN - i0), NMO, ComplexType(1.0), ma::pointer_dispatch(Orb0N.origin()), Orb0N.stride(0),
278                   ma::pointer_dispatch(T0N.origin()), T0N.stride(0), ComplexType(0.0),
279                   ma::pointer_dispatch(Gr.origin()) + i0, 1);
280       /*
281       fill_n(Gr.origin(),dm_size,ComplexType(0.0,0.0));
282       for(int i=0; i<NMO; i++) {
283         ComplexType* O_(Orbitals[i].origin());
284         ComplexType* T_(T[i].origin());
285         ComplexType* G_(Gr.origin());
286         for(int j=0; j<dm_size; j++, O_++, T_++, G_++)
287           (*G_) += std::conj(*O_) * (*T_);
288       }
289 */
290       using std::copy_n;
291       copy_n(ma::pointer_dispatch(Gr.origin()) + i0, (iN - i0), Gr_[0].origin());
292 
293       if (walker_type == CLOSED)
294       {
295         auto Gur(Gr_[0].origin());
296         auto DM(to_address(DMWork[iw].origin()) + i0);
297         auto X_(2.0 * Xw[iw]);
298         for (int ir = i0; ir < iN; ir++, Gur++, DM++)
299           (*DM) += X_ * (*Gur) * (*Gur);
300       }
301       else if (walker_type == COLLINEAR)
302       {
303         auto&& Gd = G[iw][1];
304         ma::product(Gd, Orb0N, T0N);
305         batched_dot('H', 'T', (iN - i0), NMO, ComplexType(1.0), ma::pointer_dispatch(Orb0N.origin()), Orb0N.stride(0),
306                     ma::pointer_dispatch(T0N.origin()), T0N.stride(0), ComplexType(0.0),
307                     ma::pointer_dispatch(Gr.origin()) + i0, 1);
308         using std::copy_n;
309         copy_n(ma::pointer_dispatch(Gr.origin()) + i0, (iN - i0), Gr_[1].origin());
310 
311         auto Gur(Gr_[0].origin());
312         auto Gdr(Gr_[1].origin());
313         auto DM(to_address(DMWork[iw].origin()) + i0);
314         auto X_(2.0 * Xw[iw]);
315         for (int ir = i0; ir < iN; ir++, Gur++, DM++)
316           (*DM) += X_ * (*Gur) * (*Gdr);
317       }
318       else if (walker_type == NONCOLLINEAR)
319       {
320         APP_ABORT(" Error: NONCOLLINEAR not yet implemented in n2r. \n");
321       }
322     }
323     TG.TG_local().barrier();
324   }
325 
326   template<class HostCVec>
accumulate_block(int iav,HostCVec && wgt,bool impsamp)327   void accumulate_block(int iav, HostCVec&& wgt, bool impsamp)
328   {
329     int nw(denom.size(0));
330     int i0, iN;
331     std::tie(i0, iN) = FairDivideBoundary(TG.TG_local().rank(), dm_size, TG.TG_local().size());
332     TG.TG_local().barrier();
333     if (TG.TG_local().root())
334       for (int iw = 0; iw < nw; iw++)
335         denom[iw] = wgt[iw] / denom[iw];
336     TG.TG_local().barrier();
337 
338     // DMAverage[iav][ij] += sum_iw DMWork[iw][ij] * denom[iw] = T( DMWork ) * denom
339     ma::product(ComplexType(1.0, 0.0), ma::T(DMWork({0, nw}, {i0, iN})), denom, ComplexType(1.0, 0.0),
340                 DMAverage[iav].sliced(i0, iN));
341     TG.TG_local().barrier();
342   }
343 
344   template<class HostCVec>
print(int iblock,hdf_archive & dump,HostCVec && Wsum)345   void print(int iblock, hdf_archive& dump, HostCVec&& Wsum)
346   {
347     using std::fill_n;
348     const int n_zero = 9;
349 
350     if (TG.TG_local().root())
351     {
352       ma::scal(ComplexType(1.0 / block_size), DMAverage);
353       TG.TG_heads().reduce_in_place_n(to_address(DMAverage.origin()), DMAverage.num_elements(), std::plus<>(), 0);
354       if (writer)
355       {
356         dump.push(std::string("N2R"));
357         for (int i = 0; i < nave; ++i)
358         {
359           dump.push(std::string("Average_") + std::to_string(i));
360           std::string padded_iblock =
361               std::string(n_zero - std::to_string(iblock).length(), '0') + std::to_string(iblock);
362           stdCVector_ref DMAverage_(to_address(DMAverage[i].origin()), {dm_size});
363           dump.write(DMAverage_, "n2r_" + padded_iblock);
364           dump.write(Wsum[i], "denominator_" + padded_iblock);
365           dump.pop();
366         }
367         dump.pop();
368       }
369     }
370     TG.TG_local().barrier();
371     fill_n(DMAverage.origin(), DMAverage.num_elements(), ComplexType(0.0, 0.0));
372   }
373 
374 private:
375   aux_Allocator aux_alloc;
376 
377   int block_size;
378 
379   int nave;
380 
381   int counter;
382 
383   TaskGroup_& TG;
384 
385   WALKER_TYPES walker_type;
386 
387   int dm_size;
388 
389   bool writer;
390 
391   bool use_host_memory;
392 
393   std::string hdf_walker_output;
394 
395   auxCMatrix Orbitals;
396 
397   // DMAverage (nave, spin*spin*x*NMO*(x*NMO-1)/2 ), x=(1:CLOSED/COLLINEAR, 2:NONCOLLINEAR)
398   mpi3CMatrix DMAverage;
399   // DMWork (nwalk, spin*spin*x*NMO*(x*NMO-1)/2 ), x=(1:CLOSED/COLLINEAR, 2:NONCOLLINEAR)
400   mpi3CMatrix DMWork;
401 
402   mpi3CVector denom;
403 
404   // buffer space
405   auxCVector Buff;
406   stdCVector Buff2;
407 
set_buffer(size_t N)408   void set_buffer(size_t N)
409   {
410     if (Buff.num_elements() < N)
411       Buff = auxCVector(iextensions<1u>{N}, aux_alloc);
412     using std::fill_n;
413     fill_n(Buff.origin(), N, ComplexType(0.0));
414   }
set_buffer2(size_t N)415   void set_buffer2(size_t N)
416   {
417     if (Buff2.num_elements() < N)
418       Buff2 = stdCVector(iextensions<1u>{N});
419     using std::fill_n;
420     fill_n(Buff2.origin(), N, ComplexType(0.0));
421   }
422 };
423 
424 } // namespace afqmc
425 } // namespace qmcplusplus
426 
427 #endif
428