//////////////////////////////////////////////////////////////////////
// This file is distributed under the University of Illinois/NCSA Open Source
// License.  See LICENSE file in top directory for details.
//
// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
//
// File developed by: Miguel A. Morales, moralessilva2@llnl.gov, Lawrence Livermore National Laboratory
//
// File created by: Miguel A. Morales, moralessilva2@llnl.gov, Lawrence Livermore National Laboratory
////////////////////////////////////////////////////////////////////////////////

#include <vector>
#include <map>
#include <string>
#include <iostream>
#include <tuple>
#include <mutex>

#include "AFQMC/config.h"
#include "AFQMC/Numerics/csr_blas.hpp"
#include "AFQMC/Wavefunctions/PHMSD.hpp"

//#include "AFQMC/Wavefunctions/PHMSD.h"

namespace qmcplusplus
{
namespace afqmc
{
/*
   * Calculates the local energy and overlaps of all the walkers in the set and
   * returns them in the appropriate data structures
   */
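/*
   * A minimal calling sketch (hypothetical caller; `wfn` and `wset` are assumed
   * names, the container shapes follow the asserts inside the routine):
   *
   *   boost::multi::array<ComplexType, 2> E({long(wset.size()), 3});
   *   boost::multi::array<ComplexType, 1> Ov(iextensions<1u>{wset.size()});
   *   wfn.Energy_shared(wset, E, Ov);  // E[iw] = {E1, EXX, EJ}, Ov[iw] = overlap
   */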
template<class WlkSet, class Mat, class TVec>
void PHMSD::Energy_shared(const WlkSet& wset, Mat&& E, TVec&& Ov)
{
  using ma::conj;
  using std::get;
  int nspins  = 2; //(walker_type==COLLINEAR?2:1);
  size_t nkev = HamOp.number_of_ke_vectors();
  assert(E.dimensionality == 2);
  assert(Ov.dimensionality == 1);
  assert(E.size(0) == wset.size());
  assert(Ov.size(0) == wset.size());
  assert(E.size(1) == 3);

  ComplexType zero(0.0);
  auto Gsize = dm_size(false);
  auto nwalk = wset.size();
  double LogOverlapFactor(wset.getLogOverlapFactor());
  // resize shm structures if needed
  if (Ovmsd.shape() != std::make_tuple(long(nspins), long(maxn_unique_confg), long(nwalk)))
    Ovmsd.reextent({nspins, maxn_unique_confg, nwalk});
  if (Emsd.shape() != std::make_tuple(long(nspins), long(maxn_unique_confg), long(nwalk), 3l))
    Emsd.reextent({nspins, maxn_unique_confg, nwalk, 3});
  // resize Alpha here, if beta is needed resize below
  if (GrefA.shape() != std::make_tuple(nwalk, dm_dims(false, Alpha).first, dm_dims(false, Alpha).second))
    GrefA.reextent({nwalk, dm_dims(false, Alpha).first, dm_dims(false, Alpha).second});
  if (wgt.size() != nwalk)
    wgt.reextent(iextensions<1u>{nwalk});
  if (opSpinEJ.size() != nwalk)
    opSpinEJ.reextent(iextensions<1u>{nwalk});
  if (localGbuff.size() < 2 * Gsize)
    localGbuff.reextent(iextensions<1u>{2 * Gsize});
  if (eloc2.size(0) != nwalk || eloc2.size(1) != 3)
    eloc2.reextent({nwalk, 3});

  std::fill_n(Ov.origin(), nwalk, zero);
  std::fill_n(E.origin(), 3 * nwalk, zero);
  std::fill_n(opSpinEJ.origin(), nwalk, ComplexType(0.0));

  // dummy: ugly but not sure how to do it better
  SPCMatrix* dummyMatPtr(nullptr);

  auto refc = abij.reference_configuration();
  std::vector<int> confg(NAEA);
  auto confgs = abij.configurations_begin();

  if (walker_type != COLLINEAR)
    APP_ABORT("Error: Finish implementation of PHMSD for CLOSED/NONCOLLINEAR walkers.\n");

  // FIX FIX FIX: Incorrect if walker_type==CLOSED
  // 1. calculate energies for unique determinants
  //    - Emsd[spin][nd_unique][iw][{0:E1, 1:EXX, 2:--}]
  //    - Ovmsd[spin][nd_unique][iw]
  if (fast_ph_energy)
  {
    // assume [nwalk][ik] for now. If needed, generalize later
    if (GrefB.shape() != std::make_tuple(nwalk, dm_dims(false, Beta).first, dm_dims(false, Beta).second))
      GrefB.reextent({nwalk, dm_dims(false, Beta).first, dm_dims(false, Beta).second});
    if (QQ0A.shape() != std::make_tuple(nwalk, long(OrbMats[0].size(0)), NAEA))
      QQ0A.reextent({nwalk, OrbMats[0].size(0), NAEA});
    if (QQ0B.shape() != std::make_tuple(nwalk, long(OrbMats.back().size(0)), NAEB))
      QQ0B.reextent({nwalk, OrbMats.back().size(0), NAEB});

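    // Round-robin work distribution: the alpha and beta blocks of each walker are
    // separate tasks, and task nc goes to rank nc % ncores of TG_local(). For
    // example (illustrative numbers), 3 walkers on 4 cores give 6 tasks that land
    // on ranks 0,1,2,3,0,1.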
    for (int iw = 0, nc = 0; iw < nwalk; iw++)
    {
      if (nc % TG.TG_local().size() == TG.TG_local().rank())
      {
        boost::multi::array_ref<ComplexType, 2> G2D_(localGbuff.origin(),
                                                     {dm_dims_ref(false, Alpha).first,
                                                      dm_dims_ref(false, Alpha).second});
        Ovmsd[0][0][iw] = SDetOp.MixedDensityMatrixForWoodbury(OrbMats[0], *wset[iw].SlaterMatrix(Alpha), G2D_,
                                                               LogOverlapFactor, refc, QQ0A[iw], true);
        confg.resize(NAEA);
        abij.get_configuration(0, 0, confg);
        auto Gr = GrefA[iw];
        std::fill_n(Gr.origin(), Gr.num_elements(), ComplexType(0.0));
        for (int k = 0; k < confg.size(); ++k)
          Gr[confg[k]] = G2D_[k];
      }
      ++nc;
      if (nc % TG.TG_local().size() == TG.TG_local().rank())
      {
        boost::multi::array_ref<ComplexType, 2> G2D_(localGbuff.origin(),
                                                     {dm_dims_ref(false, Beta).first, dm_dims_ref(false, Beta).second});
        Ovmsd[1][0][iw] =
            SDetOp.MixedDensityMatrixForWoodbury(OrbMats.back(),
                                                 *wset[iw].SlaterMatrix(SpinTypes((walker_type == CLOSED) ? 0 : 1)),
                                                 G2D_, LogOverlapFactor, refc, QQ0B[iw], true);
        confg.resize(NAEB);
        abij.get_configuration(1, 0, confg);
        auto Gr = GrefB[iw];
        std::fill_n(Gr.origin(), Gr.num_elements(), ComplexType(0.0));
        for (int k = 0; k < confg.size(); ++k)
          Gr[confg[k]] = G2D_[k];
      }
      ++nc;
    }
    TG.local_barrier();
    HamOp.fast_energy(Emsd, Ovmsd, GrefA, GrefB, QQ0A, QQ0B, Qwork, abij, det_couplings);
  }
  else
  {
    ComplexType ov0;
    if (KEright.shape() != std::make_tuple(long(abij.number_of_unique_excitations()[0]), long(nwalk), long(nkev)))
      KEright.reextent({abij.number_of_unique_excitations()[0], nwalk, nkev});
    if (KEleft.shape() != std::make_tuple(long(nwalk), long(nkev)))
      KEleft.reextent({nwalk, nkev});
    for (int spin = 0; spin < nspins; ++spin)
    {
      int orb_spin_indx = (OrbMats.size() == 2) ? spin : 0;
      int wlk_spin_indx = (walker_type == CLOSED) ? 0 : spin;
      confg.resize((spin == 0) ? NAEA : NAEB);
      auto Gdims     = dm_dims(false, SpinTypes(spin));
      auto Gdims_ref = dm_dims_ref(false, SpinTypes(spin));
      boost::multi::array_ref<ComplexType, 2> G2D_(localGbuff.origin(), {Gdims_ref.first, Gdims_ref.second});
      int nr = Gdims.first * Gdims.second, nc = nwalk;
      if (transposed_G_for_E_)
        std::swap(nr, nc);
      // GrefA is guaranteed to have enough space
      boost::multi::array_ref<ComplexType, 2> G(to_address(GrefA.origin()), {nr, nc});
      for (int nd = 0; nd < det_couplings[spin].size(); ++nd)
      {
        abij.get_configuration(spin, nd, confg);
        // keeping this simple for now!
        for (int iw = 0; iw < nwalk; iw++)
        {
          if (iw % TG.TG_local().size() == TG.TG_local().rank())
          {
            Ovmsd[spin][nd][iw] =
                SDetOp.MixedDensityMatrixFromConfiguration(OrbMats[orb_spin_indx],
                                                           *wset[iw].SlaterMatrix(SpinTypes(wlk_spin_indx)), G2D_,
                                                           LogOverlapFactor, confg.data(), true);
            if (transposed_G_for_E_)
            {
              boost::multi::array_ref<ComplexType, 3> G3D(to_address(G.origin()), {nwalk, Gdims.first, Gdims.second});
              std::fill_n(G[iw].origin(), Gdims.first * Gdims.second, ComplexType(0.0));
              for (int k = 0; k < confg.size(); ++k)
                G3D[iw][confg[k]] = G2D_[k];
            }
            else
            {
              boost::multi::array_ref<ComplexType, 3> G3D(to_address(G.origin()), {Gdims.first, Gdims.second, nwalk});
              for (int k = 0; k < Gdims.first; ++k)
                for (int j = 0; j < Gdims.second; ++j)
                  G3D[k][j][iw] = ComplexType(0.0);
              for (int k = 0; k < confg.size(); ++k)
                G3D[confg[k]]({0, Gdims.second}, iw) = G2D_[k];
            }
          }
        }
        TG.local_barrier();
        if (spin == 0)
        {
          auto KEr = KEright[nd];
          HamOp.energy(eloc2, G, orb_spin_indx, dummyMatPtr, std::addressof(KEr), TG.TG_local().root(), true, true);
          // reduce_n since Emsd is in shared memory (doing this instead of copy with mutex)
          TG.TG_local().reduce_n(to_address(eloc2.origin()), 3 * nwalk, to_address(Emsd[spin][nd].origin()),
                                 std::plus<>(), 0);
          //std::cout<<" E: " <<spin <<" " <<nd <<" " <<Ovmsd[spin][nd][0] <<" "
          //<<eloc2[0][0] <<" "
          //<<eloc2[0][1] <<" "
          //<<eloc2[0][2] <<std::endl;
          TG.local_barrier();
        }
        else
        {
          HamOp.energy(eloc2, G, orb_spin_indx, std::addressof(KEleft), dummyMatPtr, TG.TG_local().root(), true, true);
          //std::cout<<" E: " <<spin <<" " <<nd <<" " <<Ovmsd[spin][nd][0] <<" "
          //<<eloc2[0][0] <<" "
          //<<eloc2[0][1] <<" "
          //<<eloc2[0][2] <<std::endl;
          // reduce_n since Emsd is in shared memory (doing this instead of copy with mutex)
          TG.TG_local().reduce_n(to_address(eloc2.origin()), 3 * nwalk, to_address(Emsd[spin][nd].origin()),
                                 std::plus<>(), 0);
          TG.local_barrier();
          // round-robin KE contributions
          // iterators to configurations containing this beta configuration
          auto it  = to_address(det_couplings[1].values()) + (*det_couplings[1].pointers_begin(nd));
          auto ite = to_address(det_couplings[1].values()) + (*det_couplings[1].pointers_end(nd));
          int nt   = 0;
          for (; it < ite; ++it)
          {
            for (int iw = 0; iw < nwalk; ++iw, ++nt)
            {
              if (nt % TG.TG_local().size() == TG.TG_local().rank())
              {
                size_t nd_alp = get<0>(*(confgs + (*it)));
                auto w_       = ma::conj(get<2>(*(confgs + (*it)))) * Ovmsd[0][nd_alp][iw] * Ovmsd[1][nd][iw];
                opSpinEJ[iw] += w_ * static_cast<ComplexType>(ma::dot(KEleft[iw], KEright[nd_alp][iw]));
              }
            }
          }
        }
      }
    }
  }
  TG.local_barrier();

  // 2. assemble sum over configurations
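  //    For each unique determinant nd of one spin, accumulate over its coupled
  //    configurations c: wgt[iw] = Ovmsd[spin][nd][iw] * sum_c conj(ci_c) * Ovmsd[other][pair(c)][iw];
  //    the alpha pass (spin==0) also accumulates wgt into the total overlap Ov.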
  int nc = 0;
  for (int spin = 0; spin < nspins; ++spin)
  {
    for (int nd = 0; nd < det_couplings[spin].size(); ++nd, ++nc)
    {
      if (nc % TG.TG_local().size() == TG.TG_local().rank())
      {
        auto it  = to_address(det_couplings[spin].values()) + (*det_couplings[spin].pointers_begin(nd));
        auto ite = to_address(det_couplings[spin].values()) + (*det_couplings[spin].pointers_end(nd));
        std::fill_n(wgt.origin(), nwalk, ComplexType(0.0));
        if (spin == 0)
        {
          for (; it < ite; ++it)
          {
            auto ci     = ma::conj(get<2>(*(confgs + (*it))));
            auto Ovmsd_ = Ovmsd[1][get<1>(*(confgs + (*it)))];
            for (int iw = 0; iw < nwalk; ++iw)
            {
              wgt[iw] += ci * Ovmsd_[iw];
            }
          }
          for (int iw = 0; iw < nwalk; ++iw)
          {
            wgt[iw] *= Ovmsd[0][nd][iw];
            Ov[iw] += wgt[iw];
          }
        }
        else
        {
          for (; it < ite; ++it)
          {
            auto ci     = ma::conj(get<2>(*(confgs + (*it))));
            auto Ovmsd_ = Ovmsd[0][get<0>(*(confgs + (*it)))];
            for (int iw = 0; iw < nwalk; ++iw)
              wgt[iw] += ci * Ovmsd_[iw];
          }
          for (int iw = 0; iw < nwalk; ++iw)
            wgt[iw] *= Ovmsd[1][nd][iw];
        }
        //Emsd[spin][nd_unique][iw][{0:E1, 1:EXX, 2:EJ}]
        for (int iw = 0; iw < nwalk; ++iw)
        { // remove scaling from CLOSED shell energy evaluation
          E[iw][0] += wgt[iw] * Emsd[spin][nd][iw][0] / 2.0;
          E[iw][1] += wgt[iw] * Emsd[spin][nd][iw][1] / 2.0;
          E[iw][2] += wgt[iw] * Emsd[spin][nd][iw][2] / 4.0;
        }
      } //nd%TG.TG_local().size()==TG.TG_local().rank()
    }   //nex
  }     // spin

  // 3. reduce over TG_local
  TG.TG_local().all_reduce_in_place_n(to_address(opSpinEJ.origin()), nwalk, std::plus<>());
  TG.TG_local().all_reduce_in_place_n(to_address(Ov.origin()), nwalk, std::plus<>());
  TG.TG_local().all_reduce_in_place_n(to_address(E.origin()), 3 * nwalk, std::plus<>());
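  // Each component of E is a CI-weighted sum over determinants, so dividing by the
  // total overlap Ov[i] yields the per-walker local-energy estimator; the
  // opposite-spin Coulomb term accumulated in opSpinEJ is folded into E[i][2].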
  for (int i = 0; i < nwalk; ++i)
  {
    E[i][0] /= Ov[i];
    E[i][1] /= Ov[i];
    E[i][2] = (E[i][2] + opSpinEJ[i]) / Ov[i];
  }
  TG.local_barrier();
}

/*
   * Calculates the local energy and overlaps of all the walkers in the set and
   * returns them in the appropriate data structures
   */
template<class WlkSet, class Mat, class TVec>
void PHMSD::Energy_distributed(const WlkSet& wset, Mat&& E, TVec&& Ov)
{
  APP_ABORT(" Error: Finish PHMSD::Energy_distributed. \n");
  /*
    //1. Calculate G and overlaps
    //2. Loop over nodes in TG
    // 2.a isend G to next node. irecv next G from "previous" node
    // 2.b add local contribution to current G
    // 2.c wait for comms to finish
    //3. all reduce resulting energies

    assert(ci.size()==1);
    bool new_shm_space=false;
    const int node_number = TG.getLocalGroupNumber();
    const int nnodes = TG.getNGroupsPerTG();
    const int Gsize = dm_size(false);
    const ComplexType zero(0.0);
    const int nwalk = wset.size();
    // allocate space in shared memory for:
    //  i.  2 copies of G (always compact),
    //  ii. ovlps for local walkers
    //  iii. energies[3] for all walkers on all nodes of TG (assume all nodes have same # of walkers)
    int nt = nwalk*(2*Gsize+1);
    if(not shmbuff_for_E) {
      shmbuff_for_E = std::make_unique<SHM_Buffer>(TG.TG_local(),nt);
      new_shm_space=true;
    }
    // in case the number of walkers changes
    if(shmbuff_for_E->num_elements() < nt) {
      shmbuff_for_E = std::make_unique<SHM_Buffer>(TG.TG_local(),nt);
      new_shm_space=true;
    }
    assert(shmbuff_for_E->num_elements() >= nt);
    assert(E.dimensionality==2);
    assert(Ov.dimensionality==1);
    assert(E.size(0)==wset.size());
    assert(Ov.size(0)==wset.size());
    assert(E.size(1)==3);

    int nr=Gsize,nc=nwalk;
    if(transposed_G_for_E_) std::swap(nr,nc);
    int displ=0;
    boost::multi::array_ref<ComplexType,2> Gwork(to_address(shmbuff_for_E->origin()),
                                                {nr,nc});
      displ += Gsize*nwalk;
    boost::multi::array_ref<ComplexType,2> Grecv(to_address(shmbuff_for_E->origin())+displ,
                                                {nr,nc});
      displ += Gsize*nwalk;
    boost::multi::array_ref<ComplexType,1> overlaps(to_address(shmbuff_for_E->origin())+displ,
                                                   iextensions<1u>{nwalk});
    if(eloc2.size(0) != nnodes*nwalk || eloc2.size(1) != 3)
        eloc2.resize({nnodes*nwalk,3});
    auto elocal = eloc2[node_number*nwalk];
    int nak0,nak1;
    std::tie(nak0,nak1) = FairDivideBoundary(TG.getLocalTGRank(),Gsize*nwalk,TG.getNCoresPerTG());

    if(new_shm_space) {
      // use mpi3 when ready
      if(req_Grecv!=MPI_REQUEST_NULL)
          MPI_Request_free(&req_Grecv);
      if(req_Gsend!=MPI_REQUEST_NULL)
          MPI_Request_free(&req_Gsend);
      MPI_Send_init(Gwork.origin()+nak0,(nak1-nak0)*sizeof(ComplexType),MPI_CHAR,
                    TG.prev_core(),1234,TG.TG().impl_,&req_Gsend);
      MPI_Recv_init(Grecv.origin()+nak0,(nak1-nak0)*sizeof(ComplexType),MPI_CHAR,
                    TG.next_core(),1234,TG.TG().impl_,&req_Grecv);
    }

    std::fill_n(eloc2.origin(),3*nnodes*nwalk,ComplexType(0.0));
    TG.local_barrier();

    MPI_Status st;

    // calculate G for local walkers
    MixedDensityMatrix_for_E(wset,Gwork,overlaps,0);

    for(int k=0; k<nnodes; k++) {

      // wait for G from node behind you, copy to Gwork
      if(k>0) {
        MPI_Wait(&req_Grecv,&st);
        MPI_Wait(&req_Gsend,&st);     // need to wait for Gsend in order to overwrite Gwork
        std::copy_n(Grecv.origin()+nak0,(nak1-nak0),Gwork.origin()+nak0);
        TG.local_barrier();
      }

      // post send/recv messages with nodes ahead and behind you
      if(k < nnodes-1) {
        MPI_Start(&req_Gsend);
        MPI_Start(&req_Grecv);
      }

      // calculate this node's contribution to the local energy of the set of walkers in Gwork
      int q = (k+node_number)%nnodes;
      HamOp.energy(eloc2.sliced(q*nwalk,(q+1)*nwalk),
                   Gwork,0,TG.TG_local().root() && k==0);
      TG.local_barrier();

    }
    TG.TG().all_reduce_in_place_n(eloc2.origin(),3*nnodes*nwalk,std::plus<>());
    TG.local_barrier();
    std::copy_n(elocal.origin(),3*nwalk,E.origin());
    std::copy_n(overlaps.origin(),nwalk,Ov.origin());
    TG.local_barrier();
*/
}

/*
   * This routine has (potentially) considerable overhead if either the number of determinants
   *   or the number of walkers changes.
   * G is assumed to be in shared memory
   * Ov is assumed to be local to the core
   */
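/*
   * Shape convention, as enforced by the asserts below: G is [nwalk][dm_size] when
   * transpose==true and [dm_size][nwalk] otherwise. With compact==true only the
   * occupied-row block of each spin sector is stored; the full matrix is recovered
   * by multiplying OrbMat^T on the left at the end.
   */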
template<class WlkSet, class MatG, class TVec>
void PHMSD::MixedDensityMatrix(const WlkSet& wset, MatG&& G, TVec&& Ov, bool compact, bool transpose)
{
  // if not compact, calculate compact on temporary storage and multiply by OrbMat[] on the left at the end.
  using ma::T;
  assert(G.stride(1) == 1);
  assert(Ov.stride(0) == 1);
  if (transpose)
    assert(G.size(0) == wset.size() && G.size(1) == size_t(dm_size(not compact)));
  else
    assert(G.size(1) == wset.size() && G.size(0) == size_t(dm_size(not compact)));
  const int nw = wset.size();
  auto refc    = abij.reference_configuration();
  double LogOverlapFactor(wset.getLogOverlapFactor());
  assert(Ov.size() >= nw);
  std::fill_n(Ov.begin(), nw, 0);
  for (int i = 0; i < G.size(0); i++)
    if (i % TG.TG_local().size() == TG.TG_local().rank())
      std::fill_n(G[i].origin(), G.size(1), ComplexType(0.0));
  TG.local_barrier();
  auto Gsize = size_t(dm_size(not compact));
  if (compact)
  {
    if (localGbuff.size() < 2 * Gsize) // 2 copies needed
      localGbuff.reextent(iextensions<1u>{2 * Gsize});
  }
  else
  {
    if (localGbuff.size() < 3 * Gsize) // 3 copies needed
      localGbuff.reextent(iextensions<1u>{3 * Gsize});
  }
  if (walker_type != COLLINEAR)
  {
    APP_ABORT(" Error: Finish implementation of PHMSD::MixedDensityMatrix for CLOSED and NONCOLLINEAR. \n");
  }
  else
  {
    // always calculate compact and multiply by OrbMat at the end if full
    auto Gsize_c     = size_t(dm_size(false));
    auto GAdims      = dm_dims(false, Alpha);
    auto GBdims      = dm_dims(false, Beta);
    auto GAdims_full = dm_dims(true, Alpha);
    auto GBdims_full = dm_dims(true, Beta);
    if (compact)
    {
      GAdims_full = {0, 0};
      GBdims_full = {0, 0};
    }
    auto GAdims0 = dm_dims_ref(false, Alpha);
    auto GBdims0 = dm_dims_ref(false, Beta);
    size_t cnt   = 0;
    // REDUCE ALL THIS TEMPORARY STORAGE!!!
    // storage for reference Green functions
    boost::multi::array_ref<ComplexType, 2> GA2D0_(localGbuff.origin(), {GAdims0.first, GAdims0.second});
    cnt += GA2D0_.num_elements();
    boost::multi::array_ref<ComplexType, 2> GB2D0_(localGbuff.origin() + cnt, {GBdims0.first, GBdims0.second});
    cnt += GB2D0_.num_elements();
    // storage for Gw in case need to transpose result at the end
    boost::multi::array_ref<ComplexType, 2> GA2D_(localGbuff.origin() + cnt, {GAdims.first, GAdims.second});
    cnt += GA2D_.num_elements();
    boost::multi::array_ref<ComplexType, 2> GB2D_(localGbuff.origin() + cnt, {GBdims.first, GBdims.second});
    cnt += GB2D_.num_elements();
    boost::multi::array_ref<ComplexType, 1> GA1D_(GA2D_.origin(), iextensions<1u>{GAdims.first * GAdims.second});
    boost::multi::array_ref<ComplexType, 1> GB1D_(GB2D_.origin(), iextensions<1u>{GBdims.first * GBdims.second});
    // storage for full G in case compact=false
    boost::multi::array_ref<ComplexType, 2> Gfulla(localGbuff.origin() + cnt, {GAdims_full.first, GAdims_full.second});
    cnt += Gfulla.num_elements();
    boost::multi::array_ref<ComplexType, 2> Gfullb(localGbuff.origin() + cnt, {GBdims_full.first, GBdims_full.second});
    cnt += Gfullb.num_elements();

    const int ntasks_percore      = nw / TG.getNCoresPerTG();
    const int ntasks_total_serial = ntasks_percore * TG.getNCoresPerTG();
    const int nextra              = nw - ntasks_total_serial;
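    // Illustrative split (assumed numbers): nw = 10 walkers on 4 cores gives
    // ntasks_percore = 2, ntasks_total_serial = 8, nextra = 2; the first 8 walkers
    // are handled serially below, the remaining 2 by groups of cores further down.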

    // each processor does ntasks_percore overlaps serially
    const int w0 = TG.getLocalTGRank() * ntasks_percore;
    const int wN = (TG.getLocalTGRank() + 1) * ntasks_percore;

    // task index: task_w_d = wlk_w*ndet + d
    local_ov[0][0] = 1.0;
    local_ov[1][0] = 1.0;
    for (int iw = w0; iw < wN; ++iw)
    {
      // 1. calculate list of overlaps
      ComplexType ov0 = SDetOp.MixedDensityMatrixForWoodbury(OrbMats[0], *wset[iw].SlaterMatrix(Alpha), GA2D0_,
                                                             LogOverlapFactor, refc, local_QQ0inv0, true);
      calculate_overlaps(0, 1, 0, abij, local_QQ0inv0, Qwork, local_ov[0]);
      ov0 *= SDetOp.MixedDensityMatrixForWoodbury(OrbMats.back(), *wset[iw].SlaterMatrix(Beta), GB2D0_,
                                                  LogOverlapFactor, refc + NAEA, local_QQ0inv1, true);
      calculate_overlaps(0, 1, 1, abij, local_QQ0inv1, Qwork, local_ov[1]);
      for (auto it = abij.configurations_begin(); it < abij.configurations_end(); ++it)
        Ov[iw] += ma::conj(std::get<2>(*it)) * ov0 * local_ov[0][std::get<0>(*it)] * local_ov[1][std::get<1>(*it)];

      // 2. generate R[Nact,Nel] and generate G
      boost::multi::array_ref<ComplexType, 2> Ra(Gwork.origin(), {NAEA, long(OrbMats[0].size(0))});
      calculate_R(0, 1, 0, abij, det_couplings[0], local_QQ0inv0, Qwork, local_ov[1], ov0, Ra);
      if (transpose)
      {
        if (compact)
        {
          boost::multi::array_ref<ComplexType, 2> Gw(to_address(G[iw].origin()), {GAdims.first, GAdims.second});
          ma::product(T(Ra), GA2D0_, Gw);
        }
        else
        {
          boost::multi::array_ref<ComplexType, 2> Gw(to_address(G[iw].origin()),
                                                     {GAdims_full.first, GAdims_full.second});
          ma::product(T(Ra), GA2D0_, GA2D_);
          ma::product(T(OrbMats[0]), GA2D_, Gw);
        }
      }
      else
      {
        if (compact)
        {
          ma::product(T(Ra), GA2D0_, GA2D_);
          //G({0,GAdims.first*GAdims.second},iw) = GA1D_;
          ma::copy(GA1D_, G({0, GAdims.first * GAdims.second}, iw));
        }
        else
        {
          boost::multi::array_ref<ComplexType, 1> G1D(Gfulla.origin(), iextensions<1u>{long(Gfulla.num_elements())});
          ma::product(T(Ra), GA2D0_, GA2D_);
          ma::product(T(OrbMats[0]), GA2D_, Gfulla);
          ma::copy(G1D, G({0, Gfulla.num_elements()}, iw));
          //G({0,Gfulla.num_elements()},iw) = G1D;
        }
      }

      boost::multi::array_ref<ComplexType, 2> Rb(Gwork.origin(), {NAEB, long(OrbMats.back().size(0))});
      calculate_R(0, 1, 1, abij, det_couplings[1], local_QQ0inv1, Qwork, local_ov[0], ov0, Rb);
      if (transpose)
      {
        if (compact)
        {
          boost::multi::array_ref<ComplexType, 2> Gw(to_address(G[iw].origin()) + GAdims.first * GAdims.second,
                                                     {GBdims.first, GBdims.second});
          ma::product(T(Rb), GB2D0_, Gw);
        }
        else
        {
          boost::multi::array_ref<ComplexType, 2> Gw(to_address(G[iw].origin()) +
                                                         GAdims_full.first * GAdims_full.second,
                                                     {GBdims_full.first, GBdims_full.second});
          ma::product(T(Rb), GB2D0_, GB2D_);
          ma::product(T(OrbMats.back()), GB2D_, Gw);
        }
      }
      else
      {
        if (compact)
        {
          ma::product(T(Rb), GB2D0_, GB2D_);
          //G({GAdims.first*GAdims.second,G.size(0)},iw) = GB1D_;
          ma::copy(GB1D_, G({GAdims.first * GAdims.second, G.size(0)}, iw));
        }
        else
        {
          boost::multi::array_ref<ComplexType, 1> G1D(Gfullb.origin(), iextensions<1u>{Gfullb.num_elements()});
          ma::product(T(Rb), GB2D0_, GB2D_);
          ma::product(T(OrbMats.back()), GB2D_, Gfullb);
          //G({Gfulla.num_elements(),G.size(0)},iw) = G1D;
          ma::copy(G1D, G({Gfulla.num_elements(), G.size(0)}, iw));
        }
      }
    }
    // all remaining overlaps are performed in parallel with blocks of cores
    // partition processors in nextra groups
    if (nextra > 0)
    {
      // check if new communicator is necessary
      if (last_number_extra_tasks != nextra)
      {
        last_number_extra_tasks = nextra;
        for (int n = 0; n < nextra; n++)
        {
          int n0, n1;
          std::tie(n0, n1) = FairDivideBoundary(n, TG.getNCoresPerTG(), nextra);
          if (TG.getLocalTGRank() >= n0 && TG.getLocalTGRank() < n1)
          {
            last_task_index = n;
            break;
          }
        }
        // first setup
        local_group_comm = shared_communicator(TG.TG_local().split(last_task_index, TG.TG_local().size()));
        // this probably does not work!!!
        {
          shmCMatrix unique_overlaps_({2, maxn_unique_confg}, shared_allocator<ComplexType>{local_group_comm});
          unique_overlaps.swap(unique_overlaps_);
        }
        {
          shmCMatrix QQ0inv0_({OrbMats[0].size(0), NAEA}, shared_allocator<ComplexType>{local_group_comm});
          QQ0inv0.swap(QQ0inv0_);
        }
        {
          shmCMatrix QQ0inv1_({OrbMats.back().size(0), NAEB}, shared_allocator<ComplexType>{local_group_comm});
          QQ0inv1.swap(QQ0inv1_);
        }
        {
          shmCMatrix GA2D0_shm_({GAdims0.first, GAdims0.second}, shared_allocator<ComplexType>{local_group_comm});
          GA2D0_shm.swap(GA2D0_shm_);
        }
        {
          shmCMatrix GB2D0_shm_({GBdims0.first, GBdims0.second}, shared_allocator<ComplexType>{local_group_comm});
          GB2D0_shm.swap(GB2D0_shm_);
        }
      }
      if (last_task_index < 0 || last_task_index > nextra)
        APP_ABORT("Error: Problems in PHMSD::Overlap(WSet,Ov)");
      {
        if (local_group_comm.rank() == 0)
          unique_overlaps[0][0] = 1.0;
        if (local_group_comm.rank() == 0)
          unique_overlaps[1][0] = 1.0;
        local_group_comm.barrier();

        int M0, Mn, sz = GAdims.second;
        std::tie(M0, Mn) = FairDivideBoundary(local_group_comm.rank(), sz, local_group_comm.size());
        int iw           = (last_task_index + ntasks_total_serial);
        ComplexType ov0  = SDetOp.MixedDensityMatrixForWoodbury(OrbMats[0], *wset[iw].SlaterMatrix(Alpha), GA2D0_shm,
                                                               LogOverlapFactor, refc, QQ0inv0, local_group_comm, true);
        calculate_overlaps(local_group_comm.rank(), local_group_comm.size(), 0, abij, QQ0inv0, Qwork,
                           unique_overlaps[0]);
        local_group_comm.barrier();
        ov0 *= SDetOp.MixedDensityMatrixForWoodbury(OrbMats.back(), *wset[iw].SlaterMatrix(Beta), GB2D0_shm,
                                                    LogOverlapFactor, refc + NAEA, QQ0inv1, local_group_comm, true);
        calculate_overlaps(local_group_comm.rank(), local_group_comm.size(), 1, abij, QQ0inv1, Qwork,
                           unique_overlaps[1]);
        local_group_comm.barrier();
        size_t ic = 0;
        for (auto it = abij.configurations_begin(); it < abij.configurations_end(); ++it, ++ic)
          if (ic % local_group_comm.size() == local_group_comm.rank())
            Ov[iw] += ma::conj(std::get<2>(*it)) * ov0 * unique_overlaps[0][std::get<0>(*it)] *
                unique_overlaps[1][std::get<1>(*it)];

        // 2. generate R[Nact,Nel] and generate G
        boost::multi::array_ref<ComplexType, 2> Ra(Gwork.origin(), {NAEA, long(OrbMats[0].size(0))});
        calculate_R(local_group_comm.rank(), local_group_comm.size(), 0, abij, det_couplings[0], QQ0inv0, Qwork,
                    unique_overlaps[1], ov0, Ra);
        local_group_comm.all_reduce_in_place_n(to_address(Ra.origin()), Ra.num_elements(), std::plus<>());
        if (transpose)
        {
          if (compact)
          {
            boost::multi::array_ref<ComplexType, 2> Gw(to_address(G[iw].origin()), {GAdims.first, GAdims.second});
            ma::product(T(Ra), GA2D0_shm(GA2D0_shm.extension(0), {M0, Mn}), Gw(Gw.extension(0), {M0, Mn}));
          }
          else
          {
            boost::multi::array_ref<ComplexType, 2> Gw(to_address(G[iw].origin()),
                                                       {GAdims_full.first, GAdims_full.second});
            ma::product(T(Ra), GA2D0_shm(GA2D0_shm.extension(0), {M0, Mn}),
                        GA2D_(GA2D_.extension(0), {M0, Mn}));               // can be local
            ma::product(T(OrbMats[0]), GA2D_(GA2D_.extension(0), {M0, Mn}), // can be local
                        Gw(Gw.extension(0), {M0, Mn}));
          }
        }
        else
        {
          if (compact)
          {
            ma::product(T(Ra), GA2D0_shm(GA2D0_shm.extension(0), {M0, Mn}),
                        GA2D_(GA2D_.extension(0), {M0, Mn})); // can be local
            boost::multi::array_ref<ComplexType, 3> Gw(to_address(G.origin()),
                                                       {GAdims.first, GAdims.second, long(G.size(1))});
            // copying by hand for now, implement strided copy in ma_blas
            for (size_t k = 0; k < GA2D_.size(0); ++k)
              for (size_t m = M0; m < Mn; ++m)
                Gw[k][m][iw] = GA2D_[k][m];
          }
          else
          {
            ma::product(T(Ra), GA2D0_shm(GA2D0_shm.extension(0), {M0, Mn}),
                        GA2D_(GA2D_.extension(0), {M0, Mn})); // can be local
            ma::product(T(OrbMats[0]), GA2D_(GA2D_.extension(0), {M0, Mn}),
                        Gfulla(Gfulla.extension(0), {M0, Mn})); // can be local
            boost::multi::array_ref<ComplexType, 3> Gw(to_address(G.origin()),
                                                       {long(Gfulla.size(0)), long(Gfulla.size(1)), long(G.size(1))});
            // copying by hand for now, implement strided copy in ma_blas
            for (size_t k = 0; k < Gfulla.size(0); ++k)
              for (size_t m = M0; m < Mn; ++m)
                Gw[k][m][iw] = Gfulla[k][m];
          }
        }

        boost::multi::array_ref<ComplexType, 2> Rb(Gwork.origin(), {NAEB, long(OrbMats.back().size(0))});
        calculate_R(local_group_comm.rank(), local_group_comm.size(), 1, abij, det_couplings[1], QQ0inv1, Qwork,
                    unique_overlaps[0], ov0, Rb);
        local_group_comm.all_reduce_in_place_n(to_address(Rb.origin()), Rb.num_elements(), std::plus<>());
        if (transpose)
        {
          if (compact)
          {
            boost::multi::array_ref<ComplexType, 2> Gw(to_address(G[iw].origin()) + GAdims.first * GAdims.second,
                                                       {GBdims.first, GBdims.second});
            ma::product(T(Rb), GB2D0_shm(GB2D0_shm.extension(0), {M0, Mn}), Gw(Gw.extension(0), {M0, Mn}));
          }
          else
          {
            boost::multi::array_ref<ComplexType, 2> Gw(to_address(G[iw].origin()) +
                                                           GAdims_full.first * GAdims_full.second,
                                                       {GBdims_full.first, GBdims_full.second});
            ma::product(T(Rb), GB2D0_shm(GB2D0_shm.extension(0), {M0, Mn}),
                        GB2D_(GB2D_.extension(0), {M0, Mn}));                   // can be local
            ma::product(T(OrbMats.back()), GB2D_(GB2D_.extension(0), {M0, Mn}), // can be local
                        Gw(Gw.extension(0), {M0, Mn}));
          }
        }
        else
        {
          if (compact)
          {
            ma::product(T(Rb), GB2D0_shm(GB2D0_shm.extension(0), {M0, Mn}),
                        GB2D_(GB2D_.extension(0), {M0, Mn})); // can be local
            boost::multi::array_ref<ComplexType, 3> Gw(to_address(G[GAdims.first * GAdims.second].origin()),
                                                       {GBdims.first, GBdims.second, long(G.size(1))});
            // copying by hand for now, implement strided copy in ma_blas
            for (size_t k = 0; k < GB2D_.size(0); ++k)
              for (size_t m = M0; m < Mn; ++m)
                Gw[k][m][iw] = GB2D_[k][m];
          }
          else
          {
            ma::product(T(Rb), GB2D0_shm(GB2D0_shm.extension(0), {M0, Mn}),
                        GB2D_(GB2D_.extension(0), {M0, Mn})); // can be local
            ma::product(T(OrbMats.back()), GB2D_(GB2D_.extension(0), {M0, Mn}),
                        Gfullb(Gfullb.extension(0), {M0, Mn})); // can be local
            boost::multi::array_ref<ComplexType, 3> Gw(to_address(G[Gfulla.num_elements()].origin()),
                                                       {long(Gfullb.size(0)), long(Gfullb.size(1)), long(G.size(1))});
            // copying by hand for now, implement strided copy in ma_blas
            for (size_t k = 0; k < Gfullb.size(0); ++k)
              for (size_t m = M0; m < Mn; ++m)
                Gw[k][m][iw] = Gfullb[k][m];
          }
        }
      }
    }
  }
  // normalize G
  TG.TG_local().all_reduce_in_place_n(to_address(Ov.origin()), nw, std::plus<>());
  if (transpose)
  {
    for (size_t iw = 0; iw < G.size(0); ++iw)
      if (iw % TG.TG_local().size() == TG.TG_local().rank())
      {
        auto ov_ = ComplexType(1.0, 0.0) / Ov[iw];
        ma::scal(ov_, G[iw]);
      }
  }
  else
  {
    auto Ov_         = Ov.origin();
    const size_t nw_ = G.size(1);
    for (int ik = 0; ik < G.size(0); ++ik)
      if (ik % TG.TG_local().size() == TG.TG_local().rank())
      {
        auto Gik = to_address(G[ik].origin());
        for (size_t iw = 0; iw < nw_; ++iw)
          Gik[iw] /= Ov_[iw];
      }
  }
  TG.local_barrier();
}

/*
   * Computes the density matrix for a given reference.
   * G and Ov are expected to be in shared memory.
   * Simple round-robin is used.
   */
template<class WlkSet, class MatA, class MatB, class MatG, class TVec>
void PHMSD::DensityMatrix_shared(const WlkSet& wset,
                                 MatA&& RefA,
                                 MatB&& RefB,
                                 MatG&& G,
                                 TVec&& Ov,
                                 bool herm,
                                 bool compact,
                                 bool transposed)
{
  assert(G.stride(1) == 1);
  assert(Ov.stride(0) == 1);
  if (transposed)
    assert(G.size(0) == wset.size() && G.size(1) == size_t(dm_size(not compact)));
  else
    assert(G.size(1) == wset.size() && G.size(0) == size_t(dm_size(not compact)));
  const int nw = wset.size();
  assert(Ov.size() >= nw);
  // to force synchronization before modifying structures in SHM
  TG.local_barrier();
  fill_n(Ov.origin(), Ov.num_elements(), 0);
  fill_n(G.origin(), G.num_elements(), ComplexType(0.0));
  TG.local_barrier();
  double LogOverlapFactor(wset.getLogOverlapFactor());
  auto Gsize = size_t(dm_size(not compact));
  if (localGbuff.size() < Gsize)
    localGbuff.reextent(iextensions<1u>{Gsize});

  if (walker_type != COLLINEAR)
  {
    if (herm)
      assert(RefA.size(0) == dm_dims(false, Alpha).first && RefA.size(1) == dm_dims(false, Alpha).second);
    else
      assert(RefA.size(1) == dm_dims(false, Alpha).first && RefA.size(0) == dm_dims(false, Alpha).second);

    auto Gdims = dm_dims(not compact, Alpha);
    CMatrix_ref G2D_(localGbuff.origin(), {Gdims.first, Gdims.second});
    CVector_ref G1D_(G2D_.origin(), iextensions<1u>{G2D_.num_elements()});

    for (int iw = 0; iw < nw; ++iw)
    {
      if (iw % TG.TG_local().size() != TG.TG_local().rank())
        continue;
      Ov[iw] = SDetOp.MixedDensityMatrix(RefA, *wset[iw].SlaterMatrix(Alpha), G2D_, LogOverlapFactor, compact, herm);
      if (walker_type == CLOSED)
        Ov[iw] *= ComplexType(Ov[iw]);
      if (transposed)
        G[iw] = G1D_;
      else
        G(G.extension(0), iw) = G1D_;
    }
  }
  else
  {
    if (herm)
      assert(RefA.size(0) == dm_dims(false, Alpha).first && RefA.size(1) == dm_dims(false, Alpha).second);
    else
      assert(RefA.size(1) == dm_dims(false, Alpha).first && RefA.size(0) == dm_dims(false, Alpha).second);
    if (herm)
      assert(RefB.size(0) == dm_dims(false, Beta).first && RefB.size(1) == dm_dims(false, Beta).second);
    else
      assert(RefB.size(1) == dm_dims(false, Beta).first && RefB.size(0) == dm_dims(false, Beta).second);

    if (ovlp2.size(0) < 2 * nw)
      ovlp2.reextent(iextensions<1u>{2 * nw});
    fill_n(ovlp2.origin(), 2 * nw, ComplexType(0.0));
    auto GAdims = dm_dims(not compact, Alpha);
    auto GBdims = dm_dims(not compact, Beta);
    CMatrix_ref GA2D_(localGbuff.origin(), {GAdims.first, GAdims.second});
    CMatrix_ref GB2D_(GA2D_.origin() + GA2D_.num_elements(), {GBdims.first, GBdims.second});
    CVector_ref GA1D_(GA2D_.origin(), iextensions<1u>{GA2D_.num_elements()});
    CVector_ref GB1D_(GB2D_.origin(), iextensions<1u>{GB2D_.num_elements()});

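    // 2*nw spin-resolved tasks: even iw handles the alpha sector of walker iw/2,
    // odd iw the beta sector, so both sectors can be distributed round-robin.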
    for (int iw = 0; iw < 2 * nw; ++iw)
    {
      if (iw % TG.TG_local().size() != TG.TG_local().rank())
        continue;

      if (iw % 2 == 0)
      {
        ovlp2[iw] =
            SDetOp.MixedDensityMatrix(RefA, *wset[iw / 2].SlaterMatrix(Alpha), GA2D_, LogOverlapFactor, compact, herm);
        if (transposed)
          G[iw / 2].sliced(0, GAdims.first * GAdims.second) = GA1D_;
        else
          G({0, GAdims.first * GAdims.second}, iw / 2) = GA1D_;
      }
      else
      {
        ovlp2[iw] =
            SDetOp.MixedDensityMatrix(RefB, *wset[iw / 2].SlaterMatrix(Beta), GB2D_, LogOverlapFactor, compact, herm);
        if (transposed)
          G[iw / 2].sliced(GAdims.first * GAdims.second, G.size(1)) = GB1D_;
        else
          G({GAdims.first * GAdims.second, G.size(0)}, iw / 2) = GB1D_;
      }
    }
    if (TG.TG_local().size() > 1)
      TG.TG_local().all_reduce_in_place_n(to_address(ovlp2.origin()), 2 * nw, std::plus<>());
    if (TG.TG_local().root())
      for (int iw = 0; iw < nw; ++iw)
        Ov[iw] = ovlp2[2 * iw] * ovlp2[2 * iw + 1];
  }
  TG.local_barrier();
}

template<class MatA, class MatB, class MatG, class TVec>
void PHMSD::DensityMatrix_shared(std::vector<MatA>& Left,
                                 std::vector<MatB>& Right,
                                 std::vector<MatG>& G,
                                 TVec&& Ov,
                                 double LogOverlapFactor,
                                 bool herm,
                                 bool compact)
{
  const int nw = Left.size();
  assert(Right.size() == nw);
  assert(G.size() == nw);
  assert(Ov.size() >= nw);
  // to force synchronization before modifying structures in SHM
  TG.local_barrier();
  for (int iw = 0; iw < nw; ++iw)
  {
    if (iw % TG.TG_local().size() != TG.TG_local().rank())
      continue;
    Ov[iw] = SDetOp.MixedDensityMatrix(*Left[iw], *Right[iw], *G[iw], LogOverlapFactor, compact, herm);
  }
  TG.local_barrier();
}

/*
   * TODO: Implement.
   */
template<class WlkSet, class MatG, class CVec1, class CVec2, class Mat1, class Mat2>
void PHMSD::WalkerAveragedDensityMatrix(const WlkSet& wset,
                                        CVec1& wgt,
                                        MatG& G,
                                        CVec2& denom,
                                        Mat1&& Ovlp,
                                        Mat2&& DMsum,
                                        bool free_projection,
                                        boost::multi::array_ref<ComplexType, 3>* Refs,
                                        boost::multi::array<ComplexType, 2>* detR)
{
  APP_ABORT(" Error: Back Propagation not implemented for PHMSD. \n");
}

/*
   * Calculates the overlaps of all walkers in the set. Returns values in arrays.
   * Ov is assumed to be local to the core
   */
template<class WlkSet, class TVec>
void PHMSD::Overlap(const WlkSet& wset, TVec&& Ov)
{
  const int nw = wset.size();
  assert(Ov.size() >= nw);
  std::fill(Ov.begin(), Ov.begin() + nw, 0);
  auto refc = abij.reference_configuration();
  double LogOverlapFactor(wset.getLogOverlapFactor());
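  // Strategy: compute the reference-determinant overlap with OverlapForWoodbury,
  // expand it to all unique excitations with calculate_overlaps, and contract the
  // results with the conjugated CI coefficients configuration by configuration.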
  if (walker_type != COLLINEAR)
  {
    APP_ABORT(" Error: Finish implementation of PHMSD::Overlap for CLOSED and NONCOLLINEAR. \n");
    const int ntasks_percore      = nw / TG.getNCoresPerTG();
    const int ntasks_total_serial = ntasks_percore * TG.getNCoresPerTG();
    const int nextra              = nw - ntasks_total_serial;

    // each processor does ntasks_percore overlaps serially
    const int w0 = TG.getLocalTGRank() * ntasks_percore;
    const int wN = (TG.getLocalTGRank() + 1) * ntasks_percore;

    // task index: task_w_d = wlk_w*ndet + d
    ComplexType ov0;
    for (int iw = w0; iw < wN; ++iw)
    {
      ov0 = SDetOp.OverlapForWoodbury(OrbMats[0], *wset[iw].SlaterMatrix(Alpha), LogOverlapFactor, refc, local_QQ0inv0);
      local_ov[0][0] = 1.0;
      calculate_overlaps(0, 1, 0, abij, local_QQ0inv0, Qwork, local_ov[0]);
      for (auto it = abij.configurations_begin(); it < abij.configurations_end(); ++it)
      {
        Ov[iw] += ma::conj(std::get<2>(*it)) * ov0 * local_ov[0][std::get<0>(*it)] *
            ((walker_type == CLOSED) ? (ov0 * local_ov[0][std::get<0>(*it)]) : (ComplexType(1.0, 0.0)));
      }
    }
    if (nextra > 0)
    {
      /*
        // check if new communicator is necessary
        if( last_number_extra_tasks != nextra ) {
          last_number_extra_tasks = nextra;
          for(int n=0; n<nextra; n++) {
            int n0,n1;
            std::tie(n0,n1) = FairDivideBoundary(n,TG.getNCoresPerTG(),nextra);
            if(TG.getLocalTGRank()>=n0 && TG.getLocalTGRank()<n1) {
              last_task_index = n;
              break;
            }
          }
          // first setup
          local_group_comm = shared_communicator(TG.TG_local().split(last_task_index,
                                                                               TG.TG_local().size()));
          {
            shmCMatrix _ov_({2,maxn_unique_confg},
                                shared_allocator<ComplexType>{local_group_comm});
            unique_overlaps.swap(_ov_);
          }
          {
            shmC3Tensor _qq0inv_({1,maxnactive,size_t(NAEA)},
                                    shared_allocator<ComplexType>{local_group_comm});
            QQ0inv.swap(_qq0inv_);
          }
        }
        if(last_task_index < 0 || last_task_index > nextra)
          APP_ABORT("Error: Problems in PHMSD::Overlap(WSet,Ov)");
        {
          if(local_group_comm.rank()==0) unique_overlaps[0][0] = 1.0;
          local_group_comm.barrier();
          int iw = (last_task_index+ntasks_total_serial);
          ComplexType ov;
          SDetOp.OverlapForWoodbury(OrbMats[0],*wset[iw].SlaterMatrix(Alpha),std::addressof(ov),refc,QQ0inv0,local_group_comm);
          calculate_overlaps(local_group_comm.rank(),local_group_comm.size(),0,abij,QQ0inv0,Qwork,unique_overlaps[0]);
          local_group_comm.barrier();
          int cnt=0;
          for(auto it=abij.configurations_begin(); it<abij.configurations_end(); ++it, cnt++)
            if(cnt%local_group_comm.size()==local_group_comm.rank())
              Ov[iw] += ma::conj(std::get<2>(*it))*ov*
                    unique_overlaps[0][std::get<0>(*it)]*
                    ((walker_type==CLOSED)?(ov*unique_overlaps[0][std::get<0>(*it)]):(ComplexType(1.0,0.0)));
        }
*/
    }
  }
  else
  {
    const int ntasks_percore      = nw / TG.getNCoresPerTG();
    const int ntasks_total_serial = ntasks_percore * TG.getNCoresPerTG();
    const int nextra              = nw - ntasks_total_serial;

    // each processor does ntasks_percore overlaps serially
    const int w0 = TG.getLocalTGRank() * ntasks_percore;
    const int wN = (TG.getLocalTGRank() + 1) * ntasks_percore;

    // task index: task_w_d = wlk_w*ndet + d
    local_ov[0][0] = 1.0;
    local_ov[1][0] = 1.0;
    for (int iw = w0; iw < wN; ++iw)
    {
      ComplexType ov0 =
          SDetOp.OverlapForWoodbury(OrbMats[0], *wset[iw].SlaterMatrix(Alpha), LogOverlapFactor, refc, local_QQ0inv0);
      calculate_overlaps(0, 1, 0, abij, local_QQ0inv0, Qwork, local_ov[0]);
      ov0 *= SDetOp.OverlapForWoodbury(OrbMats.back(), *wset[iw].SlaterMatrix(Beta), LogOverlapFactor, refc + NAEA,
                                       local_QQ0inv1);
      calculate_overlaps(0, 1, 1, abij, local_QQ0inv1, Qwork, local_ov[1]);
      for (auto it = abij.configurations_begin(); it < abij.configurations_end(); ++it)
      {
        Ov[iw] += ma::conj(std::get<2>(*it)) * ov0 * local_ov[0][std::get<0>(*it)] * local_ov[1][std::get<1>(*it)];
      }
    }

    // all remaining overlaps are performed in parallel with blocks of cores
    // partition processors in nextra groups
    if (nextra > 0)
    {
      // check if new communicator is necessary
      if (last_number_extra_tasks != nextra)
      {
        last_number_extra_tasks = nextra;
        for (int n = 0; n < nextra; n++)
        {
          int n0, n1;
          std::tie(n0, n1) = FairDivideBoundary(n, TG.getNCoresPerTG(), nextra);
          if (TG.getLocalTGRank() >= n0 && TG.getLocalTGRank() < n1)
          {
            last_task_index = n;
            break;
          }
        }
        // reset
        local_group_comm = shared_communicator(TG.TG_local().split(last_task_index, TG.TG_local().size()));
        auto GAdims0     = dm_dims_ref(false, Alpha);
        auto GBdims0     = dm_dims_ref(false, Beta);
        {
          shmCMatrix unique_overlaps_({2, maxn_unique_confg}, shared_allocator<ComplexType>{local_group_comm});
          unique_overlaps.swap(unique_overlaps_);
        }
        {
          shmCMatrix QQ0inv0_({OrbMats[0].size(0), NAEA}, shared_allocator<ComplexType>{local_group_comm});
          QQ0inv0.swap(QQ0inv0_);
        }
        {
          shmCMatrix QQ0inv1_({OrbMats.back().size(0), NAEB}, shared_allocator<ComplexType>{local_group_comm});
          QQ0inv1.swap(QQ0inv1_);
        }
        // don't need them here, but allocate anyway to avoid dimension check every time
        {
          shmCMatrix GA2D0_shm_({GAdims0.first, GAdims0.second}, shared_allocator<ComplexType>{local_group_comm});
          GA2D0_shm.swap(GA2D0_shm_);
        }
        {
          shmCMatrix GB2D0_shm_({GBdims0.first, GBdims0.second}, shared_allocator<ComplexType>{local_group_comm});
          GB2D0_shm.swap(GB2D0_shm_);
        }
        local_group_comm.barrier();
      }
      if (last_task_index < 0 || last_task_index > nextra)
        APP_ABORT("Error: Problems in PHMSD::Overlap(WSet,Ov)");
      {
        if (local_group_comm.rank() == 0)
          unique_overlaps[0][0] = 1.0;
        if (local_group_comm.rank() == 0)
          unique_overlaps[1][0] = 1.0;
        local_group_comm.barrier();
        int iw         = (last_task_index + ntasks_total_serial);
        ComplexType ov = SDetOp.OverlapForWoodbury(OrbMats[0], *wset[iw].SlaterMatrix(Alpha), LogOverlapFactor, refc,
                                                   QQ0inv0, local_group_comm);
        calculate_overlaps(local_group_comm.rank(), local_group_comm.size(), 0, abij, QQ0inv0, Qwork,
                           unique_overlaps[0]);
        ov *= SDetOp.OverlapForWoodbury(OrbMats.back(), *wset[iw].SlaterMatrix(Beta), LogOverlapFactor, refc + NAEA,
                                        QQ0inv1, local_group_comm);
        calculate_overlaps(local_group_comm.rank(), local_group_comm.size(), 1, abij, QQ0inv1, Qwork,
                           unique_overlaps[1]);
        local_group_comm.barrier();
        size_t cnt = 0;
        for (auto it = abij.configurations_begin(); it < abij.configurations_end(); ++it, cnt++)
        {
          if (cnt % local_group_comm.size() == local_group_comm.rank())
            Ov[iw] += ma::conj(std::get<2>(*it)) * ov * unique_overlaps[0][std::get<0>(*it)] *
                unique_overlaps[1][std::get<1>(*it)];
        }
      }
    }
  }
  TG.TG_local().all_reduce_in_place_n(to_address(Ov.origin()), nw, std::plus<>());
}

/*
   * Orthogonalizes the Slater matrices of all walkers in the set.
   * Options:
   *  - bool importanceSampling (default=true): use algorithm appropriate for importance sampling.
   *         This means that the determinant of the R matrix in the QR decomposition is ignored.
   *         If false, add the determinant of R to the weight of the walker.
   */
template<class WlkSet>
void PHMSD::Orthogonalize(WlkSet& wset, bool impSamp)
{
  ComplexType detR(1.0, 0.0);
  double LogOverlapFactor(wset.getLogOverlapFactor());
  if (walker_type != COLLINEAR)
  {
    int cnt = 0;
    for (typename WlkSet::iterator it = wset.begin(); it != wset.end(); ++it)
    {
      if ((cnt++) % TG.getNCoresPerTG() == TG.getLocalTGRank())
      {
        if (excitedState && numExcitations.first > 0)
          OrthogonalizeExcited(*it->SlaterMatrix(Alpha), Alpha, LogOverlapFactor);
        else
          detR = SDetOp.Orthogonalize(*it->SlaterMatrix(Alpha), LogOverlapFactor);
        if (!impSamp)
        {
          if (walker_type == CLOSED)
            *it->weight() *= (detR * detR);
          else
            *it->weight() *= detR;
        }
      }
    }
  }
  else
  {
    int cnt = 0;
    for (typename WlkSet::iterator it = wset.begin(); it != wset.end(); ++it)
    {
      if ((2 * (cnt++)) % TG.getNCoresPerTG() == TG.getLocalTGRank())
      {
        if (excitedState && numExcitations.first > 0)
          OrthogonalizeExcited(*it->SlaterMatrix(Alpha), Alpha, LogOverlapFactor);
        else
          detR = SDetOp.Orthogonalize(*it->SlaterMatrix(Alpha), LogOverlapFactor);
        if (!impSamp)
        {
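          // alpha and beta updates may modify the same walker weight from
          // different cores, hence the shared-mutex guard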
          std::lock_guard<shared_mutex> guard(*mutex);
          *it->weight() *= detR;
        }
      }
      if ((2 * (cnt++) + 1) % TG.getNCoresPerTG() == TG.getLocalTGRank())
      {
        if (excitedState && numExcitations.second > 0)
          OrthogonalizeExcited(*it->SlaterMatrix(Beta), Beta, LogOverlapFactor);
        else
          detR = SDetOp.Orthogonalize(*it->SlaterMatrix(Beta), LogOverlapFactor);
        if (!impSamp)
        {
          std::lock_guard<shared_mutex> guard(*mutex);
          *it->weight() *= detR;
        }
      }
    }
  }
  TG.local_barrier();
  // recalculate overlaps
  Overlap(wset);
}


/*
   * Orthogonalizes the extended Slater matrix for excited-state calculations.
   */
template<class Mat>
void PHMSD::OrthogonalizeExcited(Mat&& A, SpinTypes spin, double LogOverlapFactor)
{
  if (walker_type == NONCOLLINEAR)
    APP_ABORT(" Error: OrthogonalizeExcited not implemented with NONCOLLINEAR.\n");
  if (spin == Alpha)
  {
    if (extendedMatAlpha.size(0) != NMO || extendedMatAlpha.size(1) != maxOccupExtendedMat.first)
      extendedMatAlpha.reextent({NMO, maxOccupExtendedMat.first});
    extendedMatAlpha(extendedMatAlpha.extension(0), {0, NAEA}) = A;
    extendedMatAlpha(extendedMatAlpha.extension(0), {NAEA, maxOccupExtendedMat.first}) =
        excitedOrbMat[0](excitedOrbMat.extension(1), {NAEA, maxOccupExtendedMat.first});
    // move orbital i -> slot a, then restore trial orbital i in its place
    for (auto& i : excitations)
      if (i.first < NMO && i.second < NMO)
      {
        extendedMatAlpha(extendedMatAlpha.extension(0), i.second) =
            extendedMatAlpha(extendedMatAlpha.extension(0), i.first);
        extendedMatAlpha(extendedMatAlpha.extension(0), i.first) =
            excitedOrbMat[0](excitedOrbMat.extension(1), i.first);
      }
    ComplexType detR             = SDetOp.Orthogonalize(extendedMatAlpha, LogOverlapFactor);
    A(A.extension(0), {0, NAEA}) = extendedMatAlpha(extendedMatAlpha.extension(0), {0, NAEA});
    for (auto& i : excitations)
      if (i.first < NMO && i.second < NMO)
        A(A.extension(0), i.first) = extendedMatAlpha(extendedMatAlpha.extension(0), i.second);
  }
  else
  {
    if (extendedMatBeta.size(0) != NMO || extendedMatBeta.size(1) != maxOccupExtendedMat.second)
      extendedMatBeta.reextent({NMO, maxOccupExtendedMat.second});
    extendedMatBeta(extendedMatBeta.extension(0), {0, NAEB}) = A;
    extendedMatBeta(extendedMatBeta.extension(0), {NAEB, maxOccupExtendedMat.second}) =
        excitedOrbMat[1](excitedOrbMat.extension(1), {NAEB, maxOccupExtendedMat.second});
    // move orbital i -> slot a, then restore trial orbital i in its place
    for (auto& i : excitations)
      if (i.first >= NMO && i.second >= NMO)
      {
        extendedMatBeta(extendedMatBeta.extension(0), i.second) =
            extendedMatBeta(extendedMatBeta.extension(0), i.first);
        extendedMatBeta(extendedMatBeta.extension(0), i.first) = excitedOrbMat[1](excitedOrbMat.extension(1), i.first);
      }
    ComplexType detR             = SDetOp.Orthogonalize(extendedMatBeta, LogOverlapFactor);
    A(A.extension(0), {0, NAEB}) = extendedMatBeta(extendedMatBeta.extension(0), {0, NAEB});
    for (auto& i : excitations)
      if (i.first >= NMO && i.second >= NMO)
        A(A.extension(0), i.first) = extendedMatBeta(extendedMatBeta.extension(0), i.second);
  }
}
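
/*
 * Layout sketch (hypothetical sizes) of the extended matrix assembled above.
 * With NMO = 6, NAEA = 3, maxOccupExtendedMat.first = 5 and one excitation
 * pair (i, a) = (1, 4), the columns of extendedMatAlpha are
 *
 *   columns [0, NAEA)                    : walker orbitals, copied from A
 *   columns [NAEA, maxOccupExtendedMat)  : excited trial orbitals
 *
 * and the i -> a shuffle amounts to
 *
 *   col(4) = col(1);        // move occupied orbital i into excited slot a
 *   col(1) = trial_col(1);  // restore trial orbital i in its place
 *
 * (col/trial_col are illustrative names). After the QR-based
 * orthogonalization the columns are copied back to A in the reverse pattern.
 */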

/*
   * Calculate the mean-field expectation value of the Cholesky potentials.
   * G could be placed in shared memory (with proper synchronization) to avoid
   * duplicating memory, at the expense of synchronization overhead.
   */
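/*
 * In symbols (standard AFQMC mean-field subtraction; notation assumed):
 *
 *   vMF_n = <Psi_T| v_n |Psi_T> / <Psi_T|Psi_T> = \sum_{ik} L^n_{ik} G_{ik},
 *
 * where the L^n are the Cholesky vectors of the two-body Hamiltonian and G
 * is the one-body density matrix of the trial state, assembled below from
 * diagonal terms and single-excitation (off-diagonal) determinant pairs.
 */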
template<class Vec>
void PHMSD::vMF(Vec&& v)
{
  if (walker_type == NONCOLLINEAR)
    APP_ABORT(" Error: Finish implementation of PHMSD::vMF for NONCOLLINEAR. \n");

  using ma::conj;
  using ma::T;
  using std::get;
  using std::norm;
  assert(v.num_elements() == local_number_of_cholesky_vectors());
  std::fill_n(v.origin(), v.num_elements(), ComplexType(0));
  auto Gsize = size_t(dm_size(false));
  if (localGbuff.size() < Gsize)
    localGbuff.reextent(iextensions<1u>{Gsize});

  boost::multi::array<ComplexType, 2> ovlps({2, maxn_unique_confg});
  std::fill_n(to_address(ovlps.origin()), 2 * maxn_unique_confg, ComplexType(0.0));

  auto confgs = abij.configurations_begin();
  std::vector<int> exct(2 * NAEA);
  std::vector<int> Iwork(2 * NAEA);
  std::vector<int> confg(NAEA);
  std::vector<int> confgB(NAEA);
  TG.Node().barrier();

  // 1. Overlaps
  for (int spin = 0, nc = 0; spin < 2; ++spin)
  {
    int orb_spin_indx = (OrbMats.size() == 2) ? spin : 0;
    confg.resize((spin == 0) ? NAEA : NAEB);
    auto Gdims_ref = dm_dims_ref(false, SpinTypes(spin));
    boost::multi::array<ComplexType, 2> SM_({Gdims_ref.second, Gdims_ref.first});
    for (int nd = 0; nd < det_couplings[spin].size(); ++nd, ++nc)
    {
      if (nc % TG.Global().size() == TG.Global().rank())
      {
        abij.get_configuration(spin, nd, confg);
        ma::Matrix2MA('H', OrbMats[orb_spin_indx], SM_, confg);
        ovlps[spin][nd] = SDetOp.Overlap(SM_, 0.0);
      }
    }
  }
  TG.Node().barrier();
  TG.Global().all_reduce_in_place_n(to_address(ovlps.origin()), ovlps.num_elements(), std::plus<>());
  TG.Node().barrier();

  ComplexType ov(0.0);
  for (auto it = abij.configurations_begin(); it < abij.configurations_end(); ++it)
    ov += norm(std::get<2>(*it)) * ovlps[0][std::get<0>(*it)] * ovlps[1][std::get<1>(*it)];
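
  // At this point ov holds the (unnormalized) trial-state norm,
  //   <Psi_T|Psi_T> = sum_n |c_n|^2 * O_alpha(n) * O_beta(n),
  // which is used below to normalize the accumulated Green's function.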

  // 2. Diagonal and off-diagonal components
  for (int spin = 0; spin < 2; ++spin)
  {
    int other_spin    = 1 - spin;
    int orb_spin_indx = (OrbMats.size() == 2) ? spin : 0;
    confg.resize((spin == 0) ? NAEA : NAEB);
    auto Gdims     = dm_dims(false, SpinTypes(spin));
    auto Gdims_ref = dm_dims_ref(false, SpinTypes(spin));
    boost::multi::array_ref<ComplexType, 2> G2D_(localGbuff.origin(), {Gdims_ref.first, Gdims_ref.second});
    boost::multi::array<ComplexType, 2> SM_({Gdims_ref.second, Gdims_ref.first});
    // store mean field G
    boost::multi::array<ComplexType, 2> G({Gdims.first, Gdims.second});
    std::fill_n(G.origin(), G.num_elements(), ComplexType(0.0));

    // diagonal contribution
    for (int nd = 0; nd < det_couplings[spin].size(); ++nd)
    {
      if (nd % TG.Global().size() == TG.Global().rank())
      {
        abij.get_configuration(spin, nd, confg);
        ma::Matrix2MA('H', OrbMats[orb_spin_indx], SM_, confg);
        SDetOp.MixedDensityMatrix(SM_, G2D_, 0.0, true);
        ComplexType wgt(0.0);
        auto it  = to_address(det_couplings[spin].values()) + (*det_couplings[spin].pointers_begin(nd));
        auto ite = to_address(det_couplings[spin].values()) + (*det_couplings[spin].pointers_end(nd));
        for (; it < ite; ++it)
          wgt += ovlps[other_spin][get_index(*(confgs + (*it)), other_spin)] * norm(get<2>(*(confgs + (*it))));
        wgt *= ovlps[spin][nd];
        for (int k = 0; k < confg.size(); ++k)
          ma::axpy(wgt, G2D_[k], G[confg[k]]);
      }
    }
    // off-diagonal contribution
    boost::multi::array<ComplexType, 2> orbs({2, Gdims.second});
    confgB.resize((spin == 0) ? NAEA : NAEB);
    ComplexType dummy(0.0);
    for (int nd = 0; nd < det_couplings[other_spin].size(); ++nd)
    {
      if (nd % TG.Global().size() == TG.Global().rank())
      {
        auto it1 = to_address(det_couplings[other_spin].values()) + (*det_couplings[other_spin].pointers_begin(nd));
        auto ite = to_address(det_couplings[other_spin].values()) + (*det_couplings[other_spin].pointers_end(nd));
        for (; it1 < ite; ++it1)
        {
          auto ci1   = get<2>(*(confgs + (*it1)));
          size_t cf1 = get_index(*(confgs + (*it1)), spin);
          abij.get_configuration(spin, cf1, confg);
          std::sort(confg.begin(), confg.end());
          for (auto it2 = it1 + 1; it2 < ite; ++it2)
          {
            size_t cf2 = get_index(*(confgs + (*it2)), spin);
            abij.get_configuration(spin, cf2, confgB);
            std::sort(confgB.begin(), confgB.end());
            exct.clear();
            int np = get_excitation_number(true, confg, confgB, exct, dummy, Iwork);
            if (np == 1)
            {
              ComplexType wgt = ma::conj(ci1) * get<2>(*(confgs + (*it2)));
              /*
               * exct[0]: position within confg of the orbital being excited
               *          (confg[exct[0]] is the occupied orbital itself);
               * exct[1]: the orbital being excited into.
               * WARNING: assumes orthonormal states; otherwise an overlap
               * factor is needed. Either calculate it (expensive) or demand
               * orthogonality.
               */
              exct[0] = confg[exct[0]];
              ma::Matrix2MA('Z', OrbMats[orb_spin_indx], orbs, exct);
              ma::axpy(wgt, orbs[0], G[exct[1]]);
              ma::axpy(ma::conj(wgt), orbs[1], G[exct[0]]);
            }
          }
        }
      }
    }
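    // For configuration pairs differing by a single excitation i -> a
    // (np == 1), the transition density matrix reduces to the two
    // trial-orbital rows extracted into `orbs`; the conj(wgt) update above
    // supplies the Hermitian-conjugate partner, so each unordered pair is
    // visited only once (it2 > it1).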
    boost::multi::array_ref<ComplexType, 1> G1D(G.origin(), iextensions<1u>{G.num_elements()});
    TG.Global().all_reduce_in_place_n(to_address(G1D.origin()), G1D.num_elements(), std::plus<>());
    ma::scal(ComplexType(1.0 / ov), G1D);
    HamOp.vbias(G1D, std::forward<Vec>(v), 0.5, 1.0);
  }
  TG.Node().barrier();

  // v is not in shared memory, so reduce over the cores of the local task group
  TG.TG_local().all_reduce_in_place_n(to_address(v.origin()), v.num_elements(), std::plus<>());
  // NOTE: since SpvnT is a truncated structure, the imaginary part of vMF,
  //       which should be exactly zero, suffers from truncation errors.
  //       Zero it explicitly.
  ma::zero_complex_part(v);
}
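
/*
 * Usage sketch (hypothetical driver code; `wfn` is a PHMSD instance): the
 * propagator typically calls vMF once at setup to build the mean-field
 * shift of the auxiliary fields,
 *
 *   boost::multi::array<ComplexType, 1> v(
 *       iextensions<1u>{wfn.local_number_of_cholesky_vectors()});
 *   wfn.vMF(v);  // on return, v is real-valued (imaginary part zeroed)
 *
 * Every core of the task group must enter the call, since vMF performs
 * collective reductions over TG internally.
 */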


} // namespace afqmc
} // namespace qmcplusplus
