1 //////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source
3 // License.  See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
6 //
7 // File developed by:
8 // Miguel A. Morales, moralessilva2@llnl.gov
9 //    Lawrence Livermore National Laboratory
10 //
11 // File created by:
12 // Miguel A. Morales, moralessilva2@llnl.gov
13 //    Lawrence Livermore National Laboratory
14 ////////////////////////////////////////////////////////////////////////////////
15 
16 #ifndef QMCPLUSPLUS_AFQMC_HAMILTONIANOPERATIONS_KP3INDEXFACTORIZATION_HPP
17 #define QMCPLUSPLUS_AFQMC_HAMILTONIANOPERATIONS_KP3INDEXFACTORIZATION_HPP
18 
19 #include <vector>
20 #include <type_traits>
21 #include <mutex>
22 #include <random>
23 
24 #include "Configuration.h"
25 #include "AFQMC/config.h"
26 #include "mpi3/shared_communicator.hpp"
27 #include "mpi3/shm/mutex.hpp"
28 #include "multi/array.hpp"
29 #include "multi/array_ref.hpp"
30 #include "AFQMC/Numerics/ma_operations.hpp"
31 
32 #include "AFQMC/Utilities/type_conversion.hpp"
33 #include "AFQMC/Utilities/taskgroup.h"
34 #include "AFQMC/Utilities/myTimer.h"
35 
36 namespace qmcplusplus
37 {
38 namespace afqmc
39 {
40 class KP3IndexFactorization
41 {
42   using sp_pointer       = SPComplexType*;
43   using const_sp_pointer = SPComplexType const*;
44 
45   using IVector       = boost::multi::array<int, 1>;
46   using CVector       = boost::multi::array<ComplexType, 1>;
47   using SpVector      = boost::multi::array<SPComplexType, 1>;
48   using CMatrix       = boost::multi::array<ComplexType, 2>;
49   using CMatrix_cref  = boost::multi::array_cref<ComplexType, 2>;
50   using CMatrix_ref   = boost::multi::array_ref<ComplexType, 2>;
51   using CVector_ref   = boost::multi::array_ref<ComplexType, 1>;
52   using SpMatrix_cref = boost::multi::array_cref<SPComplexType, 2>;
53   using SpVector_ref  = boost::multi::array_ref<SPComplexType, 1>;
54   using SpMatrix_ref  = boost::multi::array_ref<SPComplexType, 2>;
55   using C3Tensor      = boost::multi::array<ComplexType, 3>;
56   using SpMatrix      = boost::multi::array<SPComplexType, 2>;
57   using Sp3Tensor     = boost::multi::array<SPComplexType, 3>;
58   using Sp3Tensor_ref = boost::multi::array_ref<SPComplexType, 3>;
59   using Sp4Tensor_ref = boost::multi::array_ref<SPComplexType, 4>;
60   using shmCVector    = boost::multi::array<ComplexType, 1, shared_allocator<ComplexType>>;
61   using shmCMatrix    = boost::multi::array<ComplexType, 2, shared_allocator<ComplexType>>;
62   using shmIMatrix    = boost::multi::array<int, 2, shared_allocator<int>>;
63   using shmC3Tensor   = boost::multi::array<ComplexType, 3, shared_allocator<ComplexType>>;
64   using shmSpVector   = boost::multi::array<SPComplexType, 1, shared_allocator<SPComplexType>>;
65   using shmSpMatrix   = boost::multi::array<SPComplexType, 2, shared_allocator<SPComplexType>>;
66   using shmSp3Tensor  = boost::multi::array<SPComplexType, 3, shared_allocator<SPComplexType>>;
67   using communicator  = boost::mpi3::shared_communicator;
68   using shared_mutex  = boost::mpi3::shm::mutex;
69   using this_t        = KP3IndexFactorization;
70 
71 public:
72   static const HamiltonianTypes HamOpType = KPFactorized;
getHamType() const73   HamiltonianTypes getHamType() const { return HamOpType; }
74 
KP3IndexFactorization(communicator & c_,WALKER_TYPES type,IVector && nopk_,IVector && ncholpQ_,IVector && kminus_,shmIMatrix && nelpk_,shmIMatrix && QKToK2_,shmC3Tensor && hij_,shmCMatrix && h1,std::vector<shmSpMatrix> && vik,std::vector<shmSpMatrix> && vak,std::vector<shmSpMatrix> && vbl,IVector && qqm_,shmC3Tensor && vn0_,std::vector<RealType> && gQ_,int nsampleQ_,ValueType e0_,int cv0,int gncv)75   KP3IndexFactorization(communicator& c_,
76                         WALKER_TYPES type,
77                         IVector&& nopk_,
78                         IVector&& ncholpQ_,
79                         IVector&& kminus_,
80                         shmIMatrix&& nelpk_,
81                         shmIMatrix&& QKToK2_,
82                         shmC3Tensor&& hij_,
83                         shmCMatrix&& h1,
84                         std::vector<shmSpMatrix>&& vik,
85                         std::vector<shmSpMatrix>&& vak,
86                         std::vector<shmSpMatrix>&& vbl,
87                         IVector&& qqm_,
88                         shmC3Tensor&& vn0_,
89                         std::vector<RealType>&& gQ_,
90                         int nsampleQ_,
91                         ValueType e0_,
92                         int cv0,
93                         int gncv)
94       : comm(std::addressof(c_)),
95         walker_type(type),
96         global_origin(cv0),
97         global_nCV(gncv),
98         local_nCV(0),
99         E0(e0_),
100         H1(std::move(hij_)),
101         haj(std::move(h1)),
102         nopk(std::move(nopk_)),
103         ncholpQ(std::move(ncholpQ_)),
104         kminus(std::move(kminus_)),
105         nelpk(std::move(nelpk_)),
106         QKToK2(std::move(QKToK2_)),
107         LQKikn(std::move(vik)),
108         LQKank(std::move(vak)),
109         LQKbnl(std::move(vbl)),
110         Qmap(std::move(qqm_)),
111         Q2vbias(Qmap.size()),
112         number_of_symmetric_Q(0),
113         vn0(std::move(vn0_)),
114         nsampleQ(nsampleQ_),
115         gQ(std::move(gQ_)),
116         Qwn({1, 1}, shared_allocator<SPComplexType>{*comm}),
117         generator(),
118         distribution(gQ.begin(), gQ.end()),
119         SM_TMats({1, 1}, shared_allocator<SPComplexType>{*comm}),
120         TMats({1, 1}),
121         mutex(0),
122         EQ(nopk.size() + 2)
123   {
124     mutex.reserve(ncholpQ.size());
125     for (int nQ = 0; nQ < ncholpQ.size(); nQ++)
126       mutex.emplace_back(std::make_unique<shared_mutex>(*comm));
127     std::fill_n(EQ.data(), EQ.size(), 0);
128     int nkpts = nopk.size();
129     // Defines behavior over Q vector:
130     //   <0: Ignore (handled by another TG)
131     //    0: Calculate, without rho^+ contribution
132     //   >0: Calculate, with rho^+ contribution. LQKbln data located at Qmap[Q]-1
133     number_of_symmetric_Q = 0;
134     local_nCV             = 0;
135     std::fill_n(Q2vbias.origin(), nkpts, 0);
136     for (int Q = 0; Q < nkpts; Q++)
137     {
138       if (Q > kminus[Q])
139       {
140         if (Qmap[kminus[Q]] == 0)
141         {
142           assert(Qmap[Q] == 0);
143           Q2vbias[Q] = 2 * local_nCV;
144           local_nCV += ncholpQ[Q];
145         }
146         else
147         {
148           assert(Qmap[kminus[Q]] < 0);
149           assert(Qmap[Q] < 0);
150         }
151       }
152       else if (Qmap[Q] >= 0)
153       {
154         Q2vbias[Q] = 2 * local_nCV;
155         local_nCV += ncholpQ[Q];
156         if (Qmap[Q] > 0)
157           number_of_symmetric_Q++;
158       }
159     }
160     for (int Q = 0; Q < nkpts; Q++)
161     {
162       if (Qmap[Q] > 0)
163       {
164         assert(Q == kminus[Q]);
165         assert(Qmap[Q] <= number_of_symmetric_Q);
166       }
167     }
168     comm->barrier();
169   }
170 
~KP3IndexFactorization()171   ~KP3IndexFactorization() {}
172 
173   KP3IndexFactorization(const KP3IndexFactorization& other) = delete;
174   KP3IndexFactorization& operator=(const KP3IndexFactorization& other) = delete;
175   KP3IndexFactorization(KP3IndexFactorization&& other)                 = default;
176   KP3IndexFactorization& operator=(KP3IndexFactorization&& other) = default;
177 
getOneBodyPropagatorMatrix(TaskGroup_ & TG,boost::multi::array<ComplexType,1> const & vMF)178   boost::multi::array<ComplexType, 2> getOneBodyPropagatorMatrix(TaskGroup_& TG,
179                                                                  boost::multi::array<ComplexType, 1> const& vMF)
180   {
181     int nkpts = nopk.size();
182     int NMO   = std::accumulate(nopk.begin(), nopk.end(), 0);
183     int npol  = (walker_type == NONCOLLINEAR) ? 2 : 1;
184 
185     // making a copy of vMF since it will be modified
186     shmCVector vMF_(iextensions<1u>{vMF.num_elements()}, shared_allocator<ComplexType>{*comm});
187     comm->barrier();
188     //if(comm->root())
189     {
190       using std::copy_n;
191       copy_n(vMF.origin(), vMF.num_elements(), vMF_.origin());
192     }
193     comm->barrier();
194 
195     boost::multi::array<ComplexType, 2> P0({NMO, NMO});
196     boost::multi::array_ref<ComplexType, 1> P0D(to_address(P0.origin()), {P0.num_elements()});
197     std::fill_n(P0D.origin(), P0D.num_elements(), ComplexType(0));
198     vHS(vMF_, P0D);
199     if (TG.TG().size() > 1)
200       TG.TG().all_reduce_in_place_n(P0D.origin(), P0D.num_elements(), std::plus<>());
201 
202     boost::multi::array<ComplexType, 2> P1({npol * NMO, npol * NMO});
203     std::fill_n(P1.origin(), P1.num_elements(), ComplexType(0.0));
204 
205     // add spin-dependent H1
206     for (int K = 0, nk0 = 0; K < nkpts; ++K)
207     {
208       for (int i = 0, I = nk0; i < nopk[K]; i++, I++)
209       {
210         for (int p = 0; p < npol; ++p)
211           P1[p * NMO + I][p * NMO + I] += H1[K][p * nopk[K] + i][p * nopk[K] + i];
212         for (int j = i + 1, J = I + 1; j < nopk[K]; j++, J++)
213         {
214           for (int p = 0; p < npol; ++p)
215           {
216             P1[p * NMO + I][p * NMO + J] += H1[K][p * nopk[K] + i][p * nopk[K] + j];
217             P1[p * NMO + J][p * NMO + I] += H1[K][p * nopk[K] + j][p * nopk[K] + i];
218           }
219         }
220         if (walker_type == NONCOLLINEAR)
221         {
222           // offdiagonal piece
223           for (int j = 0, J = nk0; j < nopk[K]; j++, J++)
224           {
225             P1[I][NMO + J] += H1[K][i][nopk[K] + j];
226             P1[NMO + J][I] += H1[K][nopk[K] + j][i];
227           }
228         }
229       }
230       nk0 += nopk[K];
231     }
232 
233     // add P0 (diagonal in spin)
234     for (int p = 0; p < npol; ++p)
235       for (int I = 0; I < NMO; I++)
236         for (int J = 0; J < NMO; J++)
237           P1[p * NMO + I][p * NMO + J] += P0[I][J];
238 
239     // add vn0 (diagonal in spin)
240     for (int K = 0, nk0 = 0; K < nkpts; ++K)
241     {
242       for (int i = 0, I = nk0; i < nopk[K]; i++, I++)
243       {
244         for (int p = 0; p < npol; ++p)
245           P1[p * NMO + I][p * NMO + I] += vn0[K][i][i];
246         for (int j = i + 1, J = I + 1; j < nopk[K]; j++, J++)
247         {
248           for (int p = 0; p < npol; ++p)
249           {
250             P1[p * NMO + I][p * NMO + J] += vn0[K][i][j];
251             P1[p * NMO + J][p * NMO + I] += vn0[K][j][i];
252           }
253         }
254       }
255       nk0 += nopk[K];
256     }
257 
258     using ma::conj;
259     // symmetrize
260     for (int I = 0; I < npol * NMO; I++)
261     {
262       for (int J = I + 1; J < npol * NMO; J++)
263       {
264         // This is really cutoff dependent!!!
265 #if defined(MIXED_PRECISION)
266         if (std::abs(P1[I][J] - ma::conj(P1[J][I])) * 2.0 > 1e-5)
267         {
268 #else
269         if (std::abs(P1[I][J] - ma::conj(P1[J][I])) * 2.0 > 1e-6)
270         {
271 #endif
272           app_error() << " WARNING in getOneBodyPropagatorMatrix. H1 is not hermitian. \n";
273           app_error() << I << " " << J << " " << P1[I][J] << " " << P1[J][I] << std::endl;
274           //<< H1[K][i][j] << " "
275           //<< H1[K][j][i] << " " << vn0[K][i][j] << " " << vn0[K][j][i] << std::endl;
276         }
277         P1[I][J] = 0.5 * (P1[I][J] + ma::conj(P1[J][I]));
278         P1[J][I] = ma::conj(P1[I][J]);
279       }
280     }
281     return P1;
282   }
283 
284   template<class Mat, class MatB>
285   void energy(Mat&& E, MatB const& G, int k, bool addH1 = true, bool addEJ = true, bool addEXX = true)
286   {
287     MatB* Kr(nullptr);
288     MatB* Kl(nullptr);
289     energy(E, G, k, Kl, Kr, addH1, addEJ, addEXX);
290   }
291 
292   // KEleft and KEright must be in shared memory for this to work correctly
293   template<class Mat, class MatB, class MatC, class MatD>
294   void energy(Mat&& E,
295               MatB const& Gc,
296               int nd,
297               MatC* KEleft,
298               MatD* KEright,
299               bool addH1  = true,
300               bool addEJ  = true,
301               bool addEXX = true)
302   {
303     if (nsampleQ > 0)
304       energy_sampleQ(E, Gc, nd, KEleft, KEright, addH1, addEJ, addEXX);
305     else
306       energy_exact(E, Gc, nd, KEleft, KEright, addH1, addEJ, addEXX);
307   }
308 
309   // KEleft and KEright must be in shared memory for this to work correctly
310   template<class Mat, class MatB, class MatC, class MatD>
311   void energy_exact(Mat&& E,
312                     MatB const& Gc,
313                     int nd,
314                     MatC* KEleft,
315                     MatD* KEright,
316                     bool addH1  = true,
317                     bool addEJ  = true,
318                     bool addEXX = true)
319   {
320     int nkpts = nopk.size();
321     assert(E.size(1) >= 3);
322     assert(nd >= 0 && nd < nelpk.size());
323 
324     int nwalk     = Gc.size(1);
325     int nspin     = (walker_type == COLLINEAR ? 2 : 1);
326     int npol      = (walker_type == NONCOLLINEAR ? 2 : 1);
327     int nmo_tot   = std::accumulate(nopk.begin(), nopk.end(), 0);
328     int nmo_max   = *std::max_element(nopk.begin(), nopk.end());
329     int nocca_tot = std::accumulate(nelpk[nd].begin(), nelpk[nd].begin() + nkpts, 0);
330     int nocca_max = *std::max_element(nelpk[nd].begin(), nelpk[nd].begin() + nkpts);
331     int nchol_max = *std::max_element(ncholpQ.begin(), ncholpQ.end());
332     int noccb_tot = 0;
333     if (walker_type == COLLINEAR)
334       noccb_tot = std::accumulate(nelpk[nd].begin() + nkpts, nelpk[nd].begin() + 2 * nkpts, 0);
335     int getKr = KEright != nullptr;
336     int getKl = KEleft != nullptr;
337     if (E.size(0) != nwalk || E.size(1) < 3)
338       APP_ABORT(
339           " Error in AFQMC/HamiltonianOperations/KP3IndexFactorization::energy(). Incorrect matrix dimensions \n");
340 
341     size_t mem_needs(nwalk * nkpts * nkpts * nspin * nocca_max * nmo_max * npol);
342     size_t cnt(0);
343     if (addEJ)
344     {
345 #if defined(MIXED_PRECISION)
346       mem_needs += 2 * nwalk * local_nCV;
347 #else
348       if (not getKr)
349         mem_needs += nwalk * local_nCV;
350       if (not getKl)
351         mem_needs += nwalk * local_nCV;
352 #endif
353     }
354     set_shm_buffer(mem_needs);
355 
356     // messy
357     SPComplexType *Krptr(nullptr), *Klptr(nullptr);
358     long Knr = 0, Knc = 0;
359     if (addEJ)
360     {
361       Knr = nwalk;
362       Knc = local_nCV;
363       cnt = 0;
364 #if defined(MIXED_PRECISION)
365       if (getKr)
366       {
367         assert(KEright->size(0) == nwalk && KEright->size(1) == local_nCV);
368         assert(KEright->stride(0) == KEright->size(1));
369       }
370 #else
371       if (getKr)
372       {
373         assert(KEright->size(0) == nwalk && KEright->size(1) == local_nCV);
374         assert(KEright->stride(0) == KEright->size(1));
375         Krptr = to_address(KEright->origin());
376       }
377       else
378 #endif
379       {
380         Krptr = to_address(SM_TMats.origin());
381         cnt += nwalk * local_nCV;
382       }
383 #if defined(MIXED_PRECISION)
384       if (getKl)
385       {
386         assert(KEleft->size(0) == nwalk && KEleft->size(1) == local_nCV);
387         assert(KEleft->stride(0) == KEleft->size(1));
388       }
389 #else
390       if (getKl)
391       {
392         assert(KEleft->size(0) == nwalk && KEleft->size(1) == local_nCV);
393         assert(KEleft->stride(0) == KEleft->size(1));
394         Klptr = to_address(KEleft->origin());
395       }
396       else
397 #endif
398       {
399         Klptr = to_address(SM_TMats.origin()) + cnt;
400         cnt += nwalk * local_nCV;
401       }
402       if (comm->root())
403         std::fill_n(Krptr, Knr * Knc, SPComplexType(0.0));
404       if (comm->root())
405         std::fill_n(Klptr, Knr * Knc, SPComplexType(0.0));
406     }
407     else if (getKr or getKl)
408     {
409       APP_ABORT(" Error: Kr and/or Kl can only be calculated with addEJ=true.\n");
410     }
411     SpMatrix_ref Kl(Klptr, {long(Knr), long(Knc)});
412     SpMatrix_ref Kr(Krptr, {long(Knr), long(Knc)});
413 
414     for (int n = 0; n < nwalk; n++)
415       std::fill_n(E[n].origin(), 3, ComplexType(0.));
416 
417     assert(Gc.num_elements() == nwalk * (nocca_tot + noccb_tot) * npol * nmo_tot);
418     boost::multi::array_cref<ComplexType, 3> G3Da(to_address(Gc.origin()), {nocca_tot * npol, nmo_tot, nwalk});
419     boost::multi::array_cref<ComplexType, 3> G3Db(to_address(Gc.origin()) + G3Da.num_elements() * (nspin - 1),
420                                                   {noccb_tot, nmo_tot, nwalk});
421 
422 
423     // with yet another mapping, it is possible to reduce the memory usage here!
424     // avoiding for now!
425     Sp4Tensor_ref GKK(to_address(SM_TMats.origin()) + cnt, {nspin, nkpts, nkpts, nwalk * npol * nmo_max * nocca_max});
426     GKaKjw_to_GKKwaj(nd, Gc, GKK, nocca_tot, noccb_tot, nmo_tot, nmo_max * nocca_max);
427     comm->barrier();
428 
429     // one-body contribution
430     // haj[ndet*nkpts][nocc*nmo]
431     if (addH1)
432     {
433       int na = 0, nk = 0, nb = 0;
434       for (int n = 0; n < nwalk; n++)
435         E[n][0] = E0;
436       for (int K = 0; K < nkpts; ++K)
437       {
438 #ifdef MIXED_PRECISION
439         // must use Gc since GKK is is SP
440         boost::multi::array_ref<ComplexType, 3> haj_K(to_address(haj[nd * nkpts + K].origin()),
441                                                       {nelpk[nd][K], npol, nopk[K]});
442         for (int a = 0; a < nelpk[nd][K]; ++a)
443           for (int pol = 0; pol < npol; ++pol)
444             ma::product(ComplexType(1.), ma::T(G3Da[(na + a) * npol + pol].sliced(nk, nk + nopk[K])), haj_K[a][pol],
445                         ComplexType(1.), E(E.extension(0), 0));
446         na += nelpk[nd][K];
447         if (walker_type == COLLINEAR)
448         {
449           boost::multi::array_ref<ComplexType, 2> haj_Kb(haj_K.origin() + haj_K.num_elements(),
450                                                          {nelpk[nd][nkpts + K], nopk[K]});
451           for (int b = 0; b < nelpk[nd][nkpts + K]; ++b)
452             ma::product(ComplexType(1.), ma::T(G3Db[nb + b].sliced(nk, nk + nopk[K])), haj_Kb[b], ComplexType(1.),
453                         E(E.extension(0), 0));
454           nb += nelpk[nd][nkpts + K];
455         }
456         nk += nopk[K];
457 #else
458         // use GKK
459         nk = nopk[K];
460         {
461           na = nelpk[nd][K];
462           CVector_ref haj_K(to_address(haj[nd * nkpts + K].origin()), {na * npol * nk});
463           SpMatrix_ref Gaj(to_address(GKK[0][K][K].origin()), {nwalk, na * npol * nk});
464           ma::product(ComplexType(1.), Gaj, haj_K, ComplexType(1.), E(E.extension(0), 0));
465         }
466         if (walker_type == COLLINEAR)
467         {
468           na = nelpk[nd][nkpts + K];
469           CVector_ref haj_K(to_address(haj[nd * nkpts + K].origin()) + nelpk[nd][K] * nk, {na * nk});
470           SpMatrix_ref Gaj(to_address(GKK[1][K][K].origin()), {nwalk, na * nk});
471           ma::product(ComplexType(1.), Gaj, haj_K, ComplexType(1.), E(E.extension(0), 0));
472         }
473 #endif
474       }
475     }
476 
477     if (addEXX)
478     {
479       size_t local_memory_needs = 2 * nwalk * nocca_max * nocca_max * nchol_max + 2 * nchol_max * nwalk;
480       if (TMats.num_elements() < local_memory_needs)
481         TMats.reextent({local_memory_needs, 1});
482       cnt = 0;
483       SpMatrix_ref Kr_local(TMats.origin(), {nwalk, nchol_max});
484       cnt += Kr_local.num_elements();
485       SpMatrix_ref Kl_local(TMats.origin() + cnt, {nwalk, nchol_max});
486       cnt += Kl_local.num_elements();
487       std::fill_n(Kr_local.origin(), Kr_local.num_elements(), SPComplexType(0.0));
488       std::fill_n(Kl_local.origin(), Kl_local.num_elements(), SPComplexType(0.0));
489       SPRealType scl = (walker_type == CLOSED ? 2.0 : 1.0);
490       size_t nqk     = 1;
491       for (int Q = 0; Q < nkpts; ++Q)
492       {
493         if (Qmap[Q] < 0)
494           continue;
495         bool haveKE = false;
496         for (int Ka = 0; Ka < nkpts; ++Ka)
497         {
498           int K0 = ((Qmap[Q] > 0) ? 0 : Ka);
499           for (int Kb = K0; Kb < nkpts; ++Kb)
500           {
501             if ((nqk++) % comm->size() == comm->rank())
502             {
503               int nchol = ncholpQ[Q];
504               int Qm    = kminus[Q];
505               int Kl    = QKToK2[Qm][Kb];
506               int Kk    = QKToK2[Q][Ka];
507               int nl    = nopk[Kl];
508               int nb    = nelpk[nd][Kb];
509               int na    = nelpk[nd][Ka];
510               int nk    = nopk[Kk];
511 
512               SpMatrix_ref Gwal(GKK[0][Ka][Kl].origin(), {nwalk * na, npol * nl});
513               SpMatrix_ref Gwbk(GKK[0][Kb][Kk].origin(), {nwalk * nb, npol * nk});
514               SpMatrix_ref Lank(to_address(LQKank[nd * nspin * nkpts + Q][Ka].origin()), {na * nchol, npol * nk});
515               auto bnl_ptr(to_address(LQKank[nd * nspin * nkpts + Qm][Kb].origin()));
516               if (Qmap[Q] > 0)
517                 bnl_ptr = to_address(LQKbnl[nd * nspin * number_of_symmetric_Q + Qmap[Q] - 1][Kb].origin());
518               SpMatrix_ref Lbnl(bnl_ptr, {nb * nchol, npol * nl});
519 
520               SpMatrix_ref Twban(TMats.origin() + cnt, {nwalk * nb, na * nchol});
521               Sp4Tensor_ref T4Dwban(TMats.origin() + cnt, {nwalk, nb, na, nchol});
522               SpMatrix_ref Twabn(Twban.origin() + Twban.num_elements(), {nwalk * na, nb * nchol});
523               Sp4Tensor_ref T4Dwabn(Twban.origin() + Twban.num_elements(), {nwalk, na, nb, nchol});
524 
525               if (na > 0 && nb > 0)
526                 ma::product(Gwal, ma::T(Lbnl), Twabn);
527               if (na > 0 && nb > 0)
528                 ma::product(Gwbk, ma::T(Lank), Twban);
529 
530               for (int n = 0; n < nwalk; ++n)
531               {
532                 SPComplexType E_(0.0);
533                 for (int a = 0; a < na; ++a)
534                   for (int b = 0; b < nb; ++b)
535                     E_ += ma::dot(T4Dwabn[n][a][b], T4Dwban[n][b][a]);
536                 if (Qmap[Q] > 0 || Ka == Kb)
537                   E[n][1] -= 0.5 * static_cast<ComplexType>(scl * E_);
538                 else
539                   E[n][1] -= static_cast<ComplexType>(scl * E_);
540               }
541 
542               if (addEJ && Ka == Kb)
543               {
544                 haveKE = true;
545                 for (int n = 0; n < nwalk; ++n)
546                   for (int a = 0; a < na; ++a)
547                   {
548                     ma::axpy(SPComplexType(1.0), T4Dwban[n][a][a], Kl_local[n].sliced(0, nchol));
549                     ma::axpy(SPComplexType(1.0), T4Dwabn[n][a][a], Kr_local[n].sliced(0, nchol));
550                   }
551               }
552 
553             } // if
554 
555             if (walker_type == COLLINEAR)
556             {
557               if ((nqk++) % comm->size() == comm->rank())
558               {
559                 int nchol = ncholpQ[Q];
560                 int Qm    = kminus[Q];
561                 int Kl    = QKToK2[Qm][Kb];
562                 int Kk    = QKToK2[Q][Ka];
563                 int nl    = nopk[Kl];
564                 int nb    = nelpk[nd][nkpts + Kb];
565                 int na    = nelpk[nd][nkpts + Ka];
566                 int nk    = nopk[Kk];
567 
568                 SpMatrix_ref Gwal(GKK[1][Ka][Kl].origin(), {nwalk * na, nl});
569                 SpMatrix_ref Gwbk(GKK[1][Kb][Kk].origin(), {nwalk * nb, nk});
570                 SpMatrix_ref Lank(to_address(LQKank[(nd * nspin + 1) * nkpts + Q][Ka].origin()), {na * nchol, nk});
571                 auto bnl_ptr(to_address(LQKank[(nd * nspin + 1) * nkpts + Qm][Kb].origin()));
572                 if (Qmap[Q] > 0)
573                   bnl_ptr = to_address(LQKbnl[(nd * nspin + 1) * number_of_symmetric_Q + Qmap[Q] - 1][Kb].origin());
574                 SpMatrix_ref Lbnl(bnl_ptr, {nb * nchol, nl});
575 
576                 SpMatrix_ref Twban(TMats.origin() + cnt, {nwalk * nb, na * nchol});
577                 Sp4Tensor_ref T4Dwban(TMats.origin() + cnt, {nwalk, nb, na, nchol});
578                 SpMatrix_ref Twabn(Twban.origin() + Twban.num_elements(), {nwalk * na, nb * nchol});
579                 Sp4Tensor_ref T4Dwabn(Twban.origin() + Twban.num_elements(), {nwalk, na, nb, nchol});
580 
581                 if (na > 0 && nb > 0)
582                   ma::product(Gwal, ma::T(Lbnl), Twabn);
583                 if (na > 0 && nb > 0)
584                   ma::product(Gwbk, ma::T(Lank), Twban);
585 
586                 for (int n = 0; n < nwalk; ++n)
587                 {
588                   SPComplexType E_(0.0);
589                   for (int a = 0; a < na; ++a)
590                     for (int b = 0; b < nb; ++b)
591                       E_ += ma::dot(T4Dwabn[n][a][b], T4Dwban[n][b][a]);
592                   if (Qmap[Q] > 0 || Ka == Kb)
593                     E[n][1] -= 0.5 * static_cast<ComplexType>(scl * E_);
594                   else
595                     E[n][1] -= static_cast<ComplexType>(scl * E_);
596                 }
597 
598                 if (addEJ && Ka == Kb)
599                 {
600                   haveKE = true;
601                   for (int n = 0; n < nwalk; ++n)
602                     for (int a = 0; a < na; ++a)
603                     {
604                       ma::axpy(SPComplexType(1.0), T4Dwban[n][a][a], Kl_local[n].sliced(0, nchol));
605                       ma::axpy(SPComplexType(1.0), T4Dwabn[n][a][a], Kr_local[n].sliced(0, nchol));
606                     }
607                 }
608 
609               } // if
610             }   // COLLINEAR
611           }     // Kb
612         }       // Ka
613         if (addEJ && haveKE)
614         {
615           std::lock_guard<shared_mutex> guard(*mutex[Q]);
616           int nc0 = Q2vbias[Q] / 2; //std::accumulate(ncholpQ.begin(),ncholpQ.begin()+Q,0);
617           for (int n = 0; n < nwalk; n++)
618           {
619             ma::axpy(SPComplexType(1.0), Kr_local[n].sliced(0, ncholpQ[Q]), Kr[n].sliced(nc0, nc0 + ncholpQ[Q]));
620             ma::axpy(SPComplexType(1.0), Kl_local[n].sliced(0, ncholpQ[Q]), Kl[n].sliced(nc0, nc0 + ncholpQ[Q]));
621           }
622         } // to release the lock
623         if (addEJ && haveKE)
624         {
625           std::fill_n(Kr_local.origin(), Kr_local.num_elements(), SPComplexType(0.0));
626           std::fill_n(Kl_local.origin(), Kl_local.num_elements(), SPComplexType(0.0));
627         }
628       } // Q
629     }
630 
631     if (addEJ)
632     {
633       if (not addEXX)
634       {
635         // calculate Kr
636         APP_ABORT(" Error: Finish addEJ and not addEXX");
637       }
638       comm->barrier();
639       size_t nqk     = 0;
640       SPRealType scl = (walker_type == CLOSED ? 2.0 : 1.0);
641       for (int n = 0; n < nwalk; ++n)
642       {
643         for (int Q = 0; Q < nkpts; ++Q)
644         { // momentum conservation index
645           if (Qmap[Q] < 0)
646             continue;
647           if ((nqk++) % comm->size() == comm->rank())
648           {
649             int nc0 = Q2vbias[Q] / 2; //std::accumulate(ncholpQ.begin(),ncholpQ.begin()+Q,0);
650             E[n][2] += 0.5 *
651                 static_cast<ComplexType>(scl * scl *
652                                          ma::dot(Kl[n]({nc0, nc0 + ncholpQ[Q]}), Kr[n]({nc0, nc0 + ncholpQ[Q]})));
653           }
654         }
655       }
656 #if defined(MIXED_PRECISION)
657       if (getKl)
658       {
659         size_t i0, iN;
660         std::tie(i0, iN) =
661             FairDivideBoundary(size_t(comm->rank()), size_t(KEleft->num_elements()), size_t(comm->size()));
662         copy_n_cast(Klptr + i0, iN - i0, to_address(KEleft->origin()) + i0);
663       }
664       if (getKr)
665       {
666         size_t i0, iN;
667         std::tie(i0, iN) =
668             FairDivideBoundary(size_t(comm->rank()), size_t(KEright->num_elements()), size_t(comm->size()));
669         copy_n_cast(Krptr + i0, iN - i0, to_address(KEright->origin()) + i0);
670       }
671 #endif
672       comm->barrier();
673     }
674   }
675 
676   // KEleft and KEright must be in shared memory for this to work correctly
677   template<class Mat, class MatB, class MatC, class MatD>
678   void energy_sampleQ(Mat&& E,
679                       MatB const& Gc,
680                       int nd,
681                       MatC* KEleft,
682                       MatD* KEright,
683                       bool addH1  = true,
684                       bool addEJ  = true,
685                       bool addEXX = true)
686   {
687     APP_ABORT(" Error: Incomplete implementation. \n");
688     // need to finish modifications for distribution of Q
689 
690     int nkpts = nopk.size();
691     assert(E.size(1) >= 3);
692     assert(nd >= 0 && nd < nelpk.size());
693 
694     int nwalk     = Gc.size(1);
695     int nspin     = (walker_type == COLLINEAR ? 2 : 1);
696     int nmo_tot   = std::accumulate(nopk.begin(), nopk.end(), 0);
697     int nmo_max   = *std::max_element(nopk.begin(), nopk.end());
698     int nocca_tot = std::accumulate(nelpk[nd].begin(), nelpk[nd].begin() + nkpts, 0);
699     int nocca_max = *std::max_element(nelpk[nd].begin(), nelpk[nd].begin() + nkpts);
700     int nchol_max = *std::max_element(ncholpQ.begin(), ncholpQ.end());
701     int noccb_tot = 0;
702     if (walker_type == COLLINEAR)
703       noccb_tot = std::accumulate(nelpk[nd].begin() + nkpts, nelpk[nd].begin() + 2 * nkpts, 0);
704     int getKr = KEright != nullptr;
705     int getKl = KEleft != nullptr;
706     if (E.size(0) != nwalk || E.size(1) < 3)
707       APP_ABORT(" Error in AFQMC/HamiltonianOperations/KP3IndexFactorization::energy(). Incorrect matrix dimensions\n");
708 
709     size_t mem_needs(nwalk * nkpts * nkpts * nspin * nocca_max * nmo_max);
710     size_t cnt(0);
711     if (addEJ)
712     {
713 #if defined(MIXED_PRECISION)
714       mem_needs += 2 * nwalk * local_nCV;
715 #else
716       if (not getKr)
717         mem_needs += nwalk * local_nCV;
718       if (not getKl)
719         mem_needs += nwalk * local_nCV;
720 #endif
721     }
722     set_shm_buffer(mem_needs);
723 
724     // messy
725     SPComplexType *Krptr(nullptr), *Klptr(nullptr);
726     long Knr = 0, Knc = 0;
727     if (addEJ)
728     {
729       Knr = nwalk;
730       Knc = local_nCV;
731       cnt = 0;
732 #if defined(MIXED_PRECISION)
733       if (getKr)
734       {
735         assert(KEright->size(0) == nwalk && KEright->size(1) == local_nCV);
736         assert(KEright->stride(0) == KEright->size(1));
737       }
738 #else
739       if (getKr)
740       {
741         assert(KEright->size(0) == nwalk && KEright->size(1) == local_nCV);
742         assert(KEright->stride(0) == KEright->size(1));
743         Krptr = to_address(KEright->origin());
744       }
745       else
746 #endif
747       {
748         Krptr = to_address(SM_TMats.origin());
749         cnt += nwalk * local_nCV;
750       }
751 #if defined(MIXED_PRECISION)
752       if (getKl)
753       {
754         assert(KEleft->size(0) == nwalk && KEleft->size(1) == local_nCV);
755         assert(KEleft->stride(0) == KEleft->size(1));
756       }
757 #else
758       if (getKl)
759       {
760         assert(KEleft->size(0) == nwalk && KEleft->size(1) == local_nCV);
761         assert(KEleft->stride(0) == KEleft->size(1));
762         Klptr = to_address(KEleft->origin());
763       }
764       else
765 #endif
766       {
767         Klptr = to_address(SM_TMats.origin()) + cnt;
768         cnt += nwalk * local_nCV;
769       }
770       if (comm->root())
771         std::fill_n(Krptr, Knr * Knc, SPComplexType(0.0));
772       if (comm->root())
773         std::fill_n(Klptr, Knr * Knc, SPComplexType(0.0));
774     }
775     else if (getKr or getKl)
776     {
777       APP_ABORT(" Error: Kr and/or Kl can only be calculated with addEJ=true.\n");
778     }
779     SpMatrix_ref Kl(Klptr, {long(Knr), long(Knc)});
780     SpMatrix_ref Kr(Krptr, {long(Knr), long(Knc)});
781 
782     for (int n = 0; n < nwalk; n++)
783       std::fill_n(E[n].origin(), 3, ComplexType(0.));
784 
785     assert(Gc.num_elements() == nwalk * (nocca_tot + noccb_tot) * nmo_tot);
786     boost::multi::array_cref<ComplexType, 3> G3Da(to_address(Gc.origin()), {nocca_tot, nmo_tot, nwalk});
787     boost::multi::array_cref<ComplexType, 3> G3Db(to_address(Gc.origin()) + G3Da.num_elements() * (nspin - 1),
788                                                   {noccb_tot, nmo_tot, nwalk});
789 
790     Sp4Tensor_ref GKK(to_address(SM_TMats.origin()) + cnt, {nspin, nkpts, nkpts, nwalk * nmo_max * nocca_max});
791     cnt += GKK.num_elements();
792     GKaKjw_to_GKKwaj(nd, Gc, GKK, nocca_tot, noccb_tot, nmo_tot, nmo_max * nocca_max);
793     comm->barrier();
794 
795     // one-body contribution
796     // haj[ndet*nkpts][nocc*nmo]
797     // not parallelized for now, since it would require customization of Wfn
798     if (addH1)
799     {
800       // must use Gc since GKK is is SP
801       int na = 0, nk = 0, nb = 0;
802       for (int n = 0; n < nwalk; n++)
803         E[n][0] = E0;
804       for (int K = 0; K < nkpts; ++K)
805       {
806 #ifdef MIXED_PRECISION
807         boost::multi::array_ref<ComplexType, 2> haj_K(to_address(haj[nd * nkpts + K].origin()),
808                                                       {nelpk[nd][K], nopk[K]});
809         for (int a = 0; a < nelpk[nd][K]; ++a)
810           ma::product(ComplexType(1.), ma::T(G3Da[na + a].sliced(nk, nk + nopk[K])), haj_K[a], ComplexType(1.),
811                       E(E.extension(0), 0));
812         na += nelpk[nd][K];
813         if (walker_type == COLLINEAR)
814         {
815           boost::multi::array_ref<ComplexType, 2> haj_Kb(haj_K.origin() + haj_K.num_elements(),
816                                                          {nelpk[nd][nkpts + K], nopk[K]});
817           for (int b = 0; b < nelpk[nd][nkpts + K]; ++b)
818             ma::product(ComplexType(1.), ma::T(G3Db[nb + b].sliced(nk, nk + nopk[K])), haj_Kb[b], ComplexType(1.),
819                         E(E.extension(0), 0));
820           nb += nelpk[nd][nkpts + K];
821         }
822         nk += nopk[K];
823 #else
824         nk = nopk[K];
825         {
826           na = nelpk[nd][K];
827           CVector_ref haj_K(to_address(haj[nd * nkpts + K].origin()), {na * nk});
828           SpMatrix_ref Gaj(to_address(GKK[0][K][K].origin()), {nwalk, na * nk});
829           ma::product(ComplexType(1.), Gaj, haj_K, ComplexType(1.), E(E.extension(0), 0));
830         }
831         if (walker_type == COLLINEAR)
832         {
833           na = nelpk[nd][nkpts + K];
834           CVector_ref haj_K(to_address(haj[nd * nkpts + K].origin()) + nelpk[nd][K] * nk, {na * nk});
835           SpMatrix_ref Gaj(to_address(GKK[1][K][K].origin()), {nwalk, na * nk});
836           ma::product(ComplexType(1.), Gaj, haj_K, ComplexType(1.), E(E.extension(0), 0));
837         }
838 #endif
839       }
840     }
841 
842     // move calculation of H1 here
843     // NOTE: For CLOSED/NONCOLLINEAR, can do all walkers simultaneously to improve perf. of GEMM
844     //       Not sure how to do it for COLLINEAR.
845     if (addEXX)
846     {
847       if (Qwn.size(0) != nwalk || Qwn.size(1) != nsampleQ)
848         Qwn.reextent({nwalk, nsampleQ});
849       comm->barrier();
850       if (comm->root())
851       {
852         for (int n = 0; n < nwalk; ++n)
853           for (int nQ = 0; nQ < nsampleQ; ++nQ)
854           {
855             Qwn[n][nQ] = distribution(generator);
856             /*
857               RealType drand = distribution(generator);
858               RealType s(0.0);
859               bool found=false;
860               for(int Q=0; Q<nkpts; Q++) {
861                 s += gQ[Q];
862                 if( drand < s ) {
863                   Qwn[n][nQ] = Q;
864                   found=true;
865                   break;
866                 }
867               }
868               if(not found)
869                 APP_ABORT(" Error: sampleQ Qwn. \n");
870 */
871           }
872       }
873       comm->barrier();
874 
875       size_t local_memory_needs = 2 * nocca_max * nocca_max * nchol_max;
876       if (TMats.num_elements() < local_memory_needs)
877         TMats.reextent({local_memory_needs, 1});
878       size_t local_cnt = 0;
879       SPRealType scl   = (walker_type == CLOSED ? 2.0 : 1.0);
880       size_t nqk       = 1;
881       for (int n = 0; n < nwalk; ++n)
882       {
883         for (int nQ = 0; nQ < nsampleQ; ++nQ)
884         {
885           int Q = Qwn[n][nQ];
886           for (int Ka = 0; Ka < nkpts; ++Ka)
887           {
888             for (int Kb = 0; Kb < nkpts; ++Kb)
889             {
890               if ((nqk++) % comm->size() == comm->rank())
891               {
892                 int nchol = ncholpQ[Q];
893                 int Qm    = kminus[Q];
894                 int Kl    = QKToK2[Qm][Kb];
895                 int Kk    = QKToK2[Q][Ka];
896                 int nl    = nopk[Kl];
897                 int nb    = nelpk[nd][Kb];
898                 int na    = nelpk[nd][Ka];
899                 int nk    = nopk[Kk];
900 
901                 SpMatrix_ref Gal(GKK[0][Ka][Kl].origin() + n * na * nl, {na, nl});
902                 SpMatrix_ref Gbk(GKK[0][Kb][Kk].origin() + n * nb * nk, {nb, nk});
903                 SpMatrix_ref Lank(to_address(LQKank[nd * nspin * nkpts + Q][Ka].origin()), {na * nchol, nk});
904                 auto bnl_ptr(to_address(LQKank[nd * nspin * nkpts + Qm][Kb].origin()));
905                 if (Q == Qm)
906                   bnl_ptr = to_address(LQKbnl[nd * nspin * number_of_symmetric_Q + Qmap[Q] - 1][Kb].origin());
907                 SpMatrix_ref Lbnl(bnl_ptr, {nb * nchol, nl});
908 
909                 SpMatrix_ref Tban(TMats.origin() + local_cnt, {nb, na * nchol});
910                 Sp3Tensor_ref T3Dban(TMats.origin() + local_cnt, {nb, na, nchol});
911                 SpMatrix_ref Tabn(Tban.origin() + Tban.num_elements(), {na, nb * nchol});
912                 Sp3Tensor_ref T3Dabn(Tban.origin() + Tban.num_elements(), {na, nb, nchol});
913 
914                 if (na > 0 && nb > 0)
915                   ma::product(Gal, ma::T(Lbnl), Tabn);
916                 if (na > 0 && nb > 0)
917                   ma::product(Gbk, ma::T(Lank), Tban);
918 
919                 SPComplexType E_(0.0);
920                 for (int a = 0; a < na; ++a)
921                   for (int b = 0; b < nb; ++b)
922                     E_ += ma::dot(T3Dabn[a][b], T3Dban[b][a]);
923                 E[n][1] -= 0.5 * static_cast<ComplexType>(scl * E_) / gQ[Q] / double(nsampleQ);
924 
925               } // if
926 
927               if (walker_type == COLLINEAR)
928               {
929                 if ((nqk++) % comm->size() == comm->rank())
930                 {
931                   int nchol = ncholpQ[Q];
932                   int Qm    = kminus[Q];
933                   int Kl    = QKToK2[Qm][Kb];
934                   int Kk    = QKToK2[Q][Ka];
935                   int nl    = nopk[Kl];
936                   int nb    = nelpk[nd][nkpts + Kb];
937                   int na    = nelpk[nd][nkpts + Ka];
938                   int nk    = nopk[Kk];
939 
940                   SpMatrix_ref Gal(GKK[1][Ka][Kl].origin() + n * na * nl, {na, nl});
941                   SpMatrix_ref Gbk(GKK[1][Kb][Kk].origin() + n * nb * nk, {nb, nk});
942                   SpMatrix_ref Lank(to_address(LQKank[(nd * nspin + 1) * nkpts + Q][Ka].origin()), {na * nchol, nk});
943                   auto bnl_ptr(to_address(LQKank[(nd * nspin + 1) * nkpts + Qm][Kb].origin()));
944                   if (Q == Qm)
945                     bnl_ptr = to_address(LQKbnl[(nd * nspin + 1) * number_of_symmetric_Q + Qmap[Q] - 1][Kb].origin());
946                   SpMatrix_ref Lbnl(bnl_ptr, {nb * nchol, nl});
947 
948                   SpMatrix_ref Tban(TMats.origin() + local_cnt, {nb, na * nchol});
949                   Sp3Tensor_ref T3Dban(TMats.origin() + local_cnt, {nb, na, nchol});
950                   SpMatrix_ref Tabn(Tban.origin() + Tban.num_elements(), {na, nb * nchol});
951                   Sp3Tensor_ref T3Dabn(Tban.origin() + Tban.num_elements(), {na, nb, nchol});
952 
953                   if (na > 0 && nb > 0)
954                     ma::product(Gal, ma::T(Lbnl), Tabn);
955                   if (na > 0 && nb > 0)
956                     ma::product(Gbk, ma::T(Lank), Tban);
957 
958                   SPComplexType E_(0.0);
959                   for (int a = 0; a < na; ++a)
960                     for (int b = 0; b < nb; ++b)
961                       E_ += ma::dot(T3Dabn[a][b], T3Dban[b][a]);
962                   E[n][1] -= 0.5 * static_cast<ComplexType>(scl * E_) / gQ[Q] / double(nsampleQ);
963 
964                 } // if
965               }   // COLLINEAR
966             }     // Kb
967           }       // Ka
968         }         // nQ
969       }           // n
970     }
971 
972     if (addEJ)
973     {
974       size_t local_memory_needs = 2 * nchol_max * nwalk;
975       if (TMats.num_elements() < local_memory_needs)
976         TMats.reextent({local_memory_needs, 1});
977       cnt = 0;
978       SpMatrix_ref Kr_local(TMats.origin(), {nwalk, nchol_max});
979       cnt += Kr_local.num_elements();
980       SpMatrix_ref Kl_local(TMats.origin() + cnt, {nwalk, nchol_max});
981       cnt += Kl_local.num_elements();
982       std::fill_n(Kr_local.origin(), Kr_local.num_elements(), SPComplexType(0.0));
983       std::fill_n(Kl_local.origin(), Kl_local.num_elements(), SPComplexType(0.0));
984       size_t nqk = 1;
985       for (int Q = 0; Q < nkpts; ++Q)
986       {
987         bool haveKE = false;
988         for (int Ka = 0; Ka < nkpts; ++Ka)
989         {
990           if ((nqk++) % comm->size() == comm->rank())
991           {
992             haveKE    = true;
993             int nchol = ncholpQ[Q];
994             int Qm    = kminus[Q];
995             int Kl    = QKToK2[Qm][Ka];
996             int Kk    = QKToK2[Q][Ka];
997             int nl    = nopk[Kl];
998             int na    = nelpk[nd][Ka];
999             int nk    = nopk[Kk];
1000 
1001             Sp3Tensor_ref Gwal(GKK[0][Ka][Kl].origin(), {nwalk, na, nl});
1002             Sp3Tensor_ref Gwbk(GKK[0][Ka][Kk].origin(), {nwalk, na, nk});
1003             Sp3Tensor_ref Lank(to_address(LQKank[nd * nspin * nkpts + Q][Ka].origin()), {na, nchol, nk});
1004             SPComplexType* bnl_ptr(to_address(LQKank[nd * nspin * nkpts + Qm][Ka].origin()));
1005             if (Q == Qm)
1006               bnl_ptr = to_address(LQKbnl[nd * nspin * number_of_symmetric_Q + Qmap[Q] - 1][Ka].origin());
1007             Sp3Tensor_ref Lbnl(bnl_ptr, {na, nchol, nl});
1008 
1009             // Twan = sum_l G[w][a][l] L[a][n][l]
1010             for (int n = 0; n < nwalk; ++n)
1011               for (int a = 0; a < na; ++a)
1012                 ma::product(SPComplexType(1.0), Lbnl[a], Gwal[n][a], SPComplexType(1.0), Kl_local[n]);
1013             for (int n = 0; n < nwalk; ++n)
1014               for (int a = 0; a < na; ++a)
1015                 ma::product(SPComplexType(1.0), Lank[a], Gwbk[n][a], SPComplexType(1.0), Kr_local[n]);
1016           } // if
1017 
1018           if (walker_type == COLLINEAR)
1019           {
1020             if ((nqk++) % comm->size() == comm->rank())
1021             {
1022               haveKE    = true;
1023               int nchol = ncholpQ[Q];
1024               int Qm    = kminus[Q];
1025               int Kl    = QKToK2[Qm][Ka];
1026               int Kk    = QKToK2[Q][Ka];
1027               int nl    = nopk[Kl];
1028               int na    = nelpk[nd][nkpts + Ka];
1029               int nk    = nopk[Kk];
1030 
1031               Sp3Tensor_ref Gwal(GKK[1][Ka][Kl].origin(), {nwalk, na, nl});
1032               Sp3Tensor_ref Gwbk(GKK[1][Ka][Kk].origin(), {nwalk, na, nk});
1033               Sp3Tensor_ref Lank(to_address(LQKank[(nd * nspin + 1) * nkpts + Q][Ka].origin()), {na, nchol, nk});
1034               auto bnl_ptr(to_address(LQKank[(nd * nspin + 1) * nkpts + Qm][Ka].origin()));
1035               if (Q == Qm)
1036                 bnl_ptr = to_address(LQKbnl[(nd * nspin + 1) * number_of_symmetric_Q + Qmap[Q] - 1][Ka].origin());
1037               Sp3Tensor_ref Lbnl(bnl_ptr, {na, nchol, nl});
1038 
1039               // Twan = sum_l G[w][a][l] L[a][n][l]
1040               for (int n = 0; n < nwalk; ++n)
1041                 for (int a = 0; a < na; ++a)
1042                   ma::product(SPComplexType(1.0), Lbnl[a], Gwal[n][a], SPComplexType(1.0), Kl_local[n]);
1043               for (int n = 0; n < nwalk; ++n)
1044                 for (int a = 0; a < na; ++a)
1045                   ma::product(SPComplexType(1.0), Lank[a], Gwbk[n][a], SPComplexType(1.0), Kr_local[n]);
1046 
1047             } // if
1048           }   // COLLINEAR
1049         }     // Ka
1050         if (haveKE)
1051         {
1052           std::lock_guard<shared_mutex> guard(*mutex[Q]);
1053           int nc0 = std::accumulate(ncholpQ.begin(), ncholpQ.begin() + Q, 0);
1054           for (int n = 0; n < nwalk; n++)
1055           {
1056             ma::axpy(SPComplexType(1.0), Kr_local[n].sliced(0, ncholpQ[Q]), Kr[n].sliced(nc0, nc0 + ncholpQ[Q]));
1057             ma::axpy(SPComplexType(1.0), Kl_local[n].sliced(0, ncholpQ[Q]), Kl[n].sliced(nc0, nc0 + ncholpQ[Q]));
1058           }
1059         } // to release the lock
1060         if (haveKE)
1061         {
1062           std::fill_n(Kr_local.origin(), Kr_local.num_elements(), SPComplexType(0.0));
1063           std::fill_n(Kl_local.origin(), Kl_local.num_elements(), SPComplexType(0.0));
1064         }
1065       } // Q
1066       comm->barrier();
1067       nqk            = 0;
1068       SPRealType scl = (walker_type == CLOSED ? 2.0 : 1.0);
1069       for (int n = 0; n < nwalk; ++n)
1070       {
1071         for (int Q = 0; Q < nkpts; ++Q)
1072         { // momentum conservation index
1073           if ((nqk++) % comm->size() == comm->rank())
1074           {
1075             int nc0 = std::accumulate(ncholpQ.begin(), ncholpQ.begin() + Q, 0);
1076             E[n][2] += 0.5 *
1077                 static_cast<ComplexType>(scl * scl *
1078                                          ma::dot(Kl[n]({nc0, nc0 + ncholpQ[Q]}), Kr[n]({nc0, nc0 + ncholpQ[Q]})));
1079           }
1080         }
1081       }
1082 #if defined(MIXED_PRECISION)
1083       if (getKl)
1084       {
1085         size_t i0, iN;
1086         std::tie(i0, iN) =
1087             FairDivideBoundary(size_t(comm->rank()), size_t(KEleft->num_elements()), size_t(comm->size()));
1088         copy_n_cast(Klptr + i0, iN - i0, to_address(KEleft->origin()) + i0);
1089       }
1090       if (getKr)
1091       {
1092         size_t i0, iN;
1093         std::tie(i0, iN) =
1094             FairDivideBoundary(size_t(comm->rank()), size_t(KEright->num_elements()), size_t(comm->size()));
1095         copy_n_cast(Krptr + i0, iN - i0, to_address(KEright->origin()) + i0);
1096       }
1097       comm->barrier();
1098 #endif
1099     }
1100   }
1101 
1102   template<class... Args>
1103   void fast_energy(Args&&... args)
1104   {
1105     APP_ABORT(" Error: fast_energy not implemented in KP3IndexFactorization. \n");
1106   }
1107 
1108   template<class MatA,
1109            class MatB,
1110            typename = typename std::enable_if_t<(std::decay<MatA>::type::dimensionality == 1)>,
1111            typename = typename std::enable_if_t<(std::decay<MatB>::type::dimensionality == 1)>,
1112            typename = void>
1113   void vHS(MatA& X, MatB&& v, double a = 1., double c = 0.)
1114   {
1115     using BType = typename std::decay<MatB>::type::element;
1116     using AType = typename std::decay<MatA>::type::element;
1117     boost::multi::array_ref<BType, 2> v_(to_address(v.origin()), {1, v.size(0)});
1118     boost::multi::array_ref<AType, 2> X_(to_address(X.origin()), {X.size(0), 1});
1119     return vHS(X_, v_, a, c);
1120   }
1121 
1122   template<class MatA,
1123            class MatB,
1124            typename = typename std::enable_if_t<(std::decay<MatA>::type::dimensionality == 2)>,
1125            typename = typename std::enable_if_t<(std::decay<MatB>::type::dimensionality == 2)>>
1126   void vHS(MatA& Xw, MatB&& v, double a = 1., double c = 0.)
1127   {
1128     int nkpts = nopk.size();
1129     int nwalk = Xw.size(1);
1130     assert(v.size(0) == nwalk);
1131     int nspin     = (walker_type == COLLINEAR ? 2 : 1);
1132     int nmo_tot   = std::accumulate(nopk.begin(), nopk.end(), 0);
1133     int nmo_max   = *std::max_element(nopk.begin(), nopk.end());
1134     int nchol_max = *std::max_element(ncholpQ.begin(), ncholpQ.end());
1135     assert(Xw.num_elements() == nwalk * 2 * local_nCV);
1136     assert(v.num_elements() == nwalk * nmo_tot * nmo_tot);
1137     SPComplexType one(1.0, 0.0);
1138     SPComplexType im(0.0, 1.0);
1139     SPComplexType halfa(0.5 * a, 0.0);
1140     size_t local_memory_needs = nmo_max * nmo_max * nwalk;
1141     if (TMats.num_elements() < local_memory_needs)
1142       TMats.reextent({local_memory_needs, 1});
1143 
1144     using vType = typename std::decay<MatB>::type::element;
1145     boost::multi::array_ref<vType, 3> v3D(to_address(v.origin()), {nwalk, nmo_tot, nmo_tot});
1146 
1147     sp_pointer Xptr(nullptr);
1148     // I WANT C++17!!!!!!
1149     using XType = typename std::decay<MatA>::type::element;
1150     if (std::is_same<SPComplexType, XType>::value)
1151     {
1152       Xptr = reinterpret_cast<sp_pointer>(to_address(Xw.origin()));
1153     }
1154     else
1155     {
1156       size_t mem_needs = Xw.num_elements();
1157       set_shm_buffer(mem_needs);
1158       Xptr = to_address(SM_TMats.origin());
1159       {
1160         size_t i0, iN;
1161         std::tie(i0, iN) = FairDivideBoundary(size_t(comm->rank()), size_t(Xw.num_elements()), size_t(comm->size()));
1162         copy_n_cast(to_address(Xw.origin()) + i0, iN - i0, Xptr + i0);
1163       }
1164       comm->barrier();
1165     }
1166     SpMatrix_ref X(Xptr, Xw.extensions());
1167 
1168     // "rotate" X
1169     //  XIJ = 0.5*a*(Xn+ -i*Xn-), XJI = 0.5*a*(Xn+ +i*Xn-)
1170     for (int Q = 0; Q < nkpts; ++Q)
1171     {
1172       if (Qmap[Q] < 0)
1173         continue;
1174       int nq = Q2vbias[Q];
1175       int nc0, ncN;
1176       std::tie(nc0, ncN) = FairDivideBoundary(comm->rank(), ncholpQ[Q], comm->size());
1177       auto Xnp           = to_address(X[nq + nc0].origin());
1178       auto Xnm           = to_address(X[nq + ncholpQ[Q] + nc0].origin());
1179       for (int n = nc0; n < ncN; ++n)
1180       {
1181         for (int nw = 0; nw < nwalk; ++nw, ++Xnp, ++Xnm)
1182         {
1183           SPComplexType Xnp_ = halfa * ((*Xnp) - im * (*Xnm));
1184           *Xnm               = halfa * ((*Xnp) + im * (*Xnm));
1185           *Xnp               = Xnp_;
1186         }
1187       }
1188     }
1189     comm->barrier();
1190     //  then combine Q/(-Q) pieces if Q != -Q
1191     //  X(Q)np = (X(Q)np + X(-Q)nm)
1192     for (int Q = 0; Q < nkpts; ++Q)
1193     {
1194       if (Qmap[Q] == 0)
1195       {
1196         int Qm   = kminus[Q];
1197         int nq   = Q2vbias[Q];
1198         int nqm0 = Q2vbias[Qm];
1199         int nc0, ncN;
1200         std::tie(nc0, ncN) = FairDivideBoundary(comm->rank(), ncholpQ[Q], comm->size());
1201         auto Xnp           = to_address(X[nq + nc0].origin());
1202         auto Xnm           = to_address(X[nqm0 + ncholpQ[Q] + nc0].origin());
1203         for (int n = nc0; n < ncN; ++n)
1204           for (int nw = 0; nw < nwalk; ++nw, ++Xnp, ++Xnm)
1205             *Xnp = ((*Xnp) + (*Xnm));
1206       }
1207     }
1208     // scale v by 'c': assuming contiguous data
1209     {
1210       size_t i0, iN;
1211       std::tie(i0, iN) = FairDivideBoundary(size_t(comm->rank()), size_t(v.num_elements()), size_t(comm->size()));
1212       auto v_          = to_address(v.origin()) + i0;
1213       for (size_t i = i0; i < iN; ++i, ++v_)
1214         *v_ *= c;
1215     }
1216     comm->barrier();
1217 
1218     using ma::H;
1219     using ma::T;
1220     size_t nqk = 0;
1221     for (int Q = 0; Q < nkpts; ++Q)
1222     { // momentum conservation index
1223       if (Qmap[Q] < 0)
1224         continue;
1225       int nc0 = Q2vbias[Q];
1226       for (int K = 0; K < nkpts; ++K)
1227       { // K is the index of the kpoint pair of (i,k)
1228         if ((nqk++) % comm->size() == comm->rank())
1229         {
1230           int nchol = ncholpQ[Q];
1231           int QK    = QKToK2[Q][K];
1232           int ni    = nopk[K];
1233           int ni0   = std::accumulate(nopk.begin(), nopk.begin() + K, 0);
1234           int nk    = nopk[QK];
1235           int nk0   = std::accumulate(nopk.begin(), nopk.begin() + QK, 0);
1236 
1237 
1238           // v[nw][i(in K)][k(in Q(K))] += sum_n LQK[i][k][n] X[Q][n+][nw]
1239           if (Q <= kminus[Q])
1240           {
1241             SpMatrix_ref vik(TMats.origin(), {nwalk, ni * nk});
1242             Sp3Tensor_ref vik3D(TMats.origin(), {nwalk, ni, nk});
1243             SpMatrix_ref Likn(to_address(LQKikn[Q][K].origin()), {ni * nk, nchol});
1244             ma::product(T(X.sliced(nc0, nc0 + nchol)), T(Likn), vik);
1245             for (int nw = 0; nw < nwalk; nw++)
1246               for (int i = 0; i < ni; i++)
1247                 ma::axpy(one, vik3D[nw][i], v3D[nw][ni0 + i].sliced(nk0, nk0 + nk));
1248           }
1249           else
1250           { // use L(-Q)(Kk)*
1251             SpMatrix_ref vki(TMats.origin(), {nwalk, nk * ni});
1252             Sp3Tensor_ref vki3D(TMats.origin(), {nwalk, nk, ni});
1253             SpMatrix_ref Likn(to_address(LQKikn[kminus[Q]][QK].origin()), {nk * ni, nchol});
1254             ma::product(T(X.sliced(nc0, nc0 + nchol)), H(Likn), vki);
1255             for (int nw = 0; nw < nwalk; nw++)
1256             {
1257               const auto&& vki_n(vki3D[nw]);
1258               for (int i = 0; i < ni; i++)
1259               {
1260                 auto v3D_ni(to_address(v3D[nw][ni0 + i].origin()) + nk0);
1261                 for (int k = 0; k < nk; k++, ++v3D_ni)
1262                   *v3D_ni += static_cast<vType>(vki_n[k][i]);
1263               }
1264             }
1265           }
1266         }
1267       }
1268     }
1269     comm->barrier();
1270     // adding second half when Q==(-Q)
1271     nqk = 0;
1272     for (int Q = 0; Q < nkpts; ++Q)
1273     {
1274       if (Qmap[Q] > 0)
1275       { // rho(Q)^+ term
1276         int nc0 = Q2vbias[Q];
1277         for (int K = 0; K < nkpts; ++K)
1278         { // K is the index of the kpoint pair of (i,k)
1279           if ((nqk++) % comm->size() == comm->rank())
1280           {
1281             int nchol = ncholpQ[Q];
1282             int ni    = nopk[K];
1283             int ni0   = std::accumulate(nopk.begin(), nopk.begin() + K, 0);
1284             int nk    = nopk[QKToK2[Q][K]];
1285             int nk0   = std::accumulate(nopk.begin(), nopk.begin() + QKToK2[Q][K], 0);
1286             SpMatrix_ref Likn(to_address(LQKikn[Q][K].origin()), {ni * nk, nchol});
1287             SpMatrix_ref vik(TMats.origin(), {nwalk, ni * nk});
1288             Sp3Tensor_ref vik3D(TMats.origin(), {nwalk, ni, nk});
1289             // v[nw][k(in Q(K))][i(in K)] += sum_n conj(LQK[i][k][n]) X[Q][n-][nw]
1290             ma::product(T(X.sliced(nc0 + nchol, nc0 + 2 * nchol)), H(Likn), vik);
1291             for (int nw = 0; nw < nwalk; nw++)
1292             {
1293               const auto&& vik3D_n = vik3D[nw];
1294               for (int k = 0; k < nk; k++)
1295               {
1296                 auto v3D_nk = to_address(v3D[nw][nk0 + k].origin()) + ni0;
1297                 for (int i = 0; i < ni; i++, ++v3D_nk)
1298                   *v3D_nk += static_cast<vType>(vik3D_n[i][k]);
1299               }
1300             }
1301           }
1302         }
1303       }
1304     }
1305     // do I need to "rotate" back, can be done if necessary
1306     comm->barrier();
1307   }
1308 
1309   template<class MatA,
1310            class MatB,
1311            typename = typename std::enable_if_t<(std::decay<MatA>::type::dimensionality == 1)>,
1312            typename = typename std::enable_if_t<(std::decay<MatB>::type::dimensionality == 1)>,
1313            typename = void>
1314   void vbias(const MatA& G, MatB&& v, double a = 1., double c = 0., int k = 0)
1315   {
1316     using BType = typename std::decay<MatB>::type::element;
1317     using AType = typename std::decay<MatA>::type::element;
1318     boost::multi::array_ref<BType, 2> v_(to_address(v.origin()), {v.size(0), 1});
1319     boost::multi::array_cref<AType, 2> G_(to_address(G.origin()), {G.size(0), 1});
1320     return vbias(G_, v_, a, c, k);
1321   }
1322 
1323   template<class MatA,
1324            class MatB,
1325            typename = typename std::enable_if_t<(std::decay<MatA>::type::dimensionality == 2)>,
1326            typename = typename std::enable_if_t<(std::decay<MatB>::type::dimensionality == 2)>>
1327   void vbias(const MatA& Gw, MatB&& v, double a = 1., double c = 0., int nd = 0)
1328   {
1329     using std::copy_n;
1330     using GType = typename std::decay_t<typename MatA::element>;
1331     using vType = typename std::decay<MatB>::type::element;
1332     int nkpts   = nopk.size();
1333     assert(nd >= 0 && nd < nelpk.size());
1334     int nwalk = Gw.size(1);
1335     assert(v.size(0) == 2 * local_nCV);
1336     assert(v.size(1) == nwalk);
1337     int nspin     = (walker_type == COLLINEAR ? 2 : 1);
1338     int npol      = (walker_type == NONCOLLINEAR ? 2 : 1);
1339     int nmo_tot   = std::accumulate(nopk.begin(), nopk.end(), 0);
1340     int nmo_max   = *std::max_element(nopk.begin(), nopk.end());
1341     int nocca_tot = std::accumulate(nelpk[nd].begin(), nelpk[nd].begin() + nkpts, 0);
1342     int nocca_max = *std::max_element(nelpk[nd].begin(), nelpk[nd].begin() + nkpts);
1343     int noccb_max = nocca_max;
1344     int nchol_max = *std::max_element(ncholpQ.begin(), ncholpQ.end());
1345     int noccb_tot = 0;
1346     if (walker_type == COLLINEAR)
1347     {
1348       noccb_tot = std::accumulate(nelpk[nd].begin() + nkpts, nelpk[nd].begin() + 2 * nkpts, 0);
1349       noccb_max = *std::max_element(nelpk[nd].begin() + nkpts, nelpk[nd].begin() + 2 * nkpts);
1350     }
1351     RealType scl = (walker_type == CLOSED ? 2.0 : 1.0);
1352     SPComplexType one(1.0, 0.0);
1353     SPComplexType halfa(0.5 * a * scl, 0.0);
1354     SPComplexType minusimhalfa(0.0, -0.5 * a * scl);
1355     SPComplexType imhalfa(0.0, 0.5 * a * scl);
1356     size_t local_memory_needs = 2 * nchol_max * nwalk;
1357     if (walker_type == NONCOLLINEAR)
1358       local_memory_needs += nmo_max * npol * nwalk; // for transposed G
1359     if (TMats.num_elements() < local_memory_needs)
1360       TMats.reextent({local_memory_needs, 1});
1361     SpMatrix_ref vlocal(TMats.origin(), {2 * nchol_max, nwalk});
1362     std::fill_n(vlocal.origin(), vlocal.num_elements(), SPComplexType(0.0));
1363 
1364     assert(Gw.num_elements() == nwalk * (nocca_tot + noccb_tot) * npol * nmo_tot);
1365     const_sp_pointer Gptr(nullptr);
1366     // I WANT C++17!!!!!!
1367     if (std::is_same<SPComplexType, GType>::value)
1368     {
1369       Gptr = reinterpret_cast<const_sp_pointer>(to_address(Gw.origin()));
1370     }
1371     else
1372     {
1373       size_t mem_needs = Gw.num_elements();
1374       set_shm_buffer(mem_needs);
1375       {
1376         size_t i0, iN;
1377         std::tie(i0, iN) = FairDivideBoundary(size_t(comm->rank()), size_t(Gw.num_elements()), size_t(comm->size()));
1378         copy_n_cast(to_address(Gw.origin()) + i0, iN - i0, to_address(SM_TMats.origin()) + i0);
1379       }
1380       Gptr = to_address(SM_TMats.origin());
1381       comm->barrier();
1382     }
1383 
1384     boost::multi::array_cref<SPComplexType, 4> G3Da(Gptr, {nocca_tot, npol, nmo_tot, nwalk});
1385     boost::multi::array_cref<SPComplexType, 3> G3Db(Gptr + G3Da.num_elements() * (nspin - 1),
1386                                                     {noccb_tot, nmo_tot, nwalk});
1387 
1388     {
1389       size_t i0, iN;
1390       std::tie(i0, iN) = FairDivideBoundary(size_t(comm->rank()), size_t(v.size(0)), size_t(comm->size()));
1391       for (size_t i = i0; i < iN; ++i)
1392         ma::scal(c, v[i]);
1393     }
1394     comm->barrier();
1395 
1396     size_t nqk = 0;
1397     for (int K = 0; K < nkpts; ++K)
1398     { // K is the index of the kpoint pair of (a,k)
1399       for (int Q = 0; Q < nkpts; ++Q)
1400       { // momentum conservation index
1401         if (Qmap[Q] < 0)
1402           continue;
1403         bool haveV = false;
1404         if ((nqk++) % comm->size() == comm->rank())
1405         {
1406           haveV     = true;
1407           int nchol = ncholpQ[Q];
1408           int na    = nelpk[nd][K];
1409           int na0   = std::accumulate(nelpk[nd].begin(), nelpk[nd].begin() + K, 0);
1410           int nk    = nopk[QKToK2[Q][K]];
1411           int nk0   = std::accumulate(nopk.begin(), nopk.begin() + QKToK2[Q][K], 0);
1412           auto&& v1 = vlocal({0, nchol}, {0, nwalk});
1413 
1414           if (walker_type == NONCOLLINEAR)
1415           {
1416             Sp3Tensor_ref Lank(to_address(LQKank[nd * nspin * nkpts + Q][K].origin()), {na, nchol, npol * nk});
1417             // v1[Q][n][nw] += sum_K sum_a_p_k LQK[a][n][p][k] G[a][p][k][nw]
1418             for (int a = 0; a < na; ++a)
1419             {
1420               SpMatrix_ref Ga(to_address(vlocal.origin()) + vlocal.num_elements(), {npol * nk, nwalk});
1421               SpMatrix_ref Ga_(Ga.origin(), {npol, nk * nwalk});
1422               for (int p = 0; p < npol; p++)
1423                 copy_n(to_address(G3Da[na0 + a][p][nk0].origin()), nk * nwalk, Ga_[p].origin());
1424               ma::product(one, Lank[a], Ga, one, v1);
1425             }
1426           }
1427           else
1428           {
1429             Sp3Tensor_ref Lank(to_address(LQKank[nd * nspin * nkpts + Q][K].origin()), {na, nchol, nk});
1430 
1431             // v1[Q][n][nw] += sum_K sum_a_k LQK[a][n][k] G[a][k][nw]
1432             for (int a = 0; a < na; ++a)
1433               ma::product(one, Lank[a], G3Da[na0 + a][0]({nk0, nk0 + nk}, {0, nwalk}), one, v1);
1434           }
1435         }
1436         if (walker_type == COLLINEAR)
1437         {
1438           if ((nqk++) % comm->size() == comm->rank())
1439           {
1440             haveV     = true;
1441             int nchol = ncholpQ[Q];
1442             int na    = nelpk[nd][nkpts + K];
1443             int na0   = std::accumulate(nelpk[nd].begin() + nkpts, nelpk[nd].begin() + nkpts + K, 0);
1444             int nk    = nopk[QKToK2[Q][K]];
1445             int nk0   = std::accumulate(nopk.begin(), nopk.begin() + QKToK2[Q][K], 0);
1446             auto&& v1 = vlocal({0, nchol}, {0, nwalk});
1447 
1448             Sp3Tensor_ref Lank(to_address(LQKank[(nd * nspin + 1) * nkpts + Q][K].origin()), {na, nchol, nk});
1449 
1450             // v1[Q][n][nw] += sum_K sum_a_k LQK[a][n][k] G[a][k][nw]
1451             for (int a = 0; a < na; ++a)
1452               ma::product(one, Lank[a], G3Db[na0 + a]({nk0, nk0 + nk}, {0, nwalk}), one, v1);
1453           }
1454         }
1455         if (haveV)
1456         {
1457           {
1458             std::lock_guard<shared_mutex> guard(*mutex[Q]);
1459             int nc0 = Q2vbias[Q];
1460             // v+ = 0.5*a*(v1+v2)
1461             ma::axpy(halfa, vlocal.sliced(0, ncholpQ[Q]), v.sliced(nc0, nc0 + ncholpQ[Q]));
1462             // v- = -0.5*a*i*(v1-v2)
1463             ma::axpy(minusimhalfa, vlocal.sliced(0, ncholpQ[Q]), v.sliced(nc0 + ncholpQ[Q], nc0 + 2 * ncholpQ[Q]));
1464           }
1465           int Qm  = kminus[Q];
1466           int nc0 = Q2vbias[Qm];
1467           if (Qmap[Q] == 0)
1468           {
1469             std::lock_guard<shared_mutex> guard(*mutex[Qm]);
1470             // v+ = 0.5*a*(v1+v2)
1471             ma::axpy(halfa, vlocal.sliced(0, ncholpQ[Qm]), v.sliced(nc0, nc0 + ncholpQ[Qm]));
1472             // v- = -0.5*a*i*(v1-v2)
1473             ma::axpy(imhalfa, vlocal.sliced(0, ncholpQ[Qm]), v.sliced(nc0 + ncholpQ[Qm], nc0 + 2 * ncholpQ[Qm]));
1474           }
1475         } // to release the lock
1476         if (haveV)
1477           std::fill_n(vlocal.origin(), vlocal.num_elements(), SPComplexType(0.0));
1478       }
1479     }
1480     // add second contribution when Q==(-Q)
1481     for (int Q = 0; Q < nkpts; ++Q)
1482     {
1483       if (Qmap[Q] > 0)
1484       { // rho(Q)^+ term
1485         for (int K = 0; K < nkpts; ++K)
1486         { // K is the index of the kpoint pair of (a,k)
1487           bool haveV = false;
1488           if ((nqk++) % comm->size() == comm->rank())
1489           {
1490             haveV     = true;
1491             int nchol = ncholpQ[Q];
1492             int na    = nelpk[nd][K];
1493             int na0   = std::accumulate(nelpk[nd].begin(), nelpk[nd].begin() + K, 0);
1494             int nk    = nopk[QKToK2[Q][K]];
1495             int nk0   = std::accumulate(nopk.begin(), nopk.begin() + QKToK2[Q][K], 0);
1496             auto&& v1 = vlocal({0, nchol}, {0, nwalk});
1497 
1498             if (walker_type == NONCOLLINEAR)
1499             {
1500               Sp3Tensor_ref Lbnl(to_address(LQKbnl[nd * nspin * number_of_symmetric_Q + Qmap[Q] - 1][K].origin()),
1501                                  {na, nchol, npol * nk});
1502 
1503               // v1[Q][n][nw] += sum_K sum_a_s_k LQK[b][n][s][l] G[b][s][l][nw]
1504               for (int a = 0; a < na; ++a)
1505               {
1506                 SpMatrix_ref Ga(to_address(vlocal.origin()) + vlocal.num_elements(), {npol * nk, nwalk});
1507                 SpMatrix_ref Ga_(Ga.origin(), {npol, nk * nwalk});
1508                 for (int p = 0; p < npol; p++)
1509                   copy_n(to_address(G3Da[na0 + a][p][nk0].origin()), nk * nwalk, Ga_[p].origin());
1510                 ma::product(one, Lbnl[a], Ga, one, v1);
1511               }
1512             }
1513             else
1514             {
1515               Sp3Tensor_ref Lbnl(to_address(LQKbnl[nd * nspin * number_of_symmetric_Q + Qmap[Q] - 1][K].origin()),
1516                                  {na, nchol, nk});
1517 
1518               // v1[Q][n][nw] += sum_K sum_a_k LQK[b][n][l] G[b][l][nw]
1519               for (int a = 0; a < na; ++a)
1520                 ma::product(one, Lbnl[a], G3Da[na0 + a][0]({nk0, nk0 + nk}, {0, nwalk}), one, v1);
1521             }
1522           }
1523           if (walker_type == COLLINEAR)
1524           {
1525             if ((nqk++) % comm->size() == comm->rank())
1526             {
1527               haveV     = true;
1528               int nchol = ncholpQ[Q];
1529               int na    = nelpk[nd][nkpts + K];
1530               int na0   = std::accumulate(nelpk[nd].begin() + nkpts, nelpk[nd].begin() + nkpts + K, 0);
1531               int nk    = nopk[QKToK2[Q][K]];
1532               int nk0   = std::accumulate(nopk.begin(), nopk.begin() + QKToK2[Q][K], 0);
1533               auto&& v1 = vlocal({0, nchol}, {0, nwalk});
1534 
1535               Sp3Tensor_ref Lbnl(to_address(LQKbnl[(nd * nspin + 1) * number_of_symmetric_Q + Qmap[Q] - 1][K].origin()),
1536                                  {na, nchol, nk});
1537 
1538               // v1[Q][n][nw] += sum_K sum_a_k LQK[b][n][l] G[b][l][nw]
1539               for (int a = 0; a < na; ++a)
1540                 ma::product(one, Lbnl[a], G3Db[na0 + a]({nk0, nk0 + nk}, {0, nwalk}), one, v1);
1541             }
1542           }
1543           if (haveV)
1544           {
1545             {
1546               std::lock_guard<shared_mutex> guard(*mutex[Q]);
1547               int nc0 = Q2vbias[Q];
1548               // v+ = 0.5*a*(v1+v2)
1549               ma::axpy(halfa, vlocal.sliced(0, ncholpQ[Q]), v.sliced(nc0, nc0 + ncholpQ[Q]));
1550               // v- = -0.5*a*i*(v1-v2)
1551               ma::axpy(imhalfa, vlocal.sliced(0, ncholpQ[Q]), v.sliced(nc0 + ncholpQ[Q], nc0 + 2 * ncholpQ[Q]));
1552             }
1553           } // to release the lock
1554           if (haveV)
1555             std::fill_n(vlocal.origin(), vlocal.num_elements(), SPComplexType(0.0));
1556         }
1557       }
1558     }
1559     comm->barrier();
1560   }
1561 
1562   template<class Mat, class MatB>
1563   void generalizedFockMatrix(Mat&& G, MatB&& Fp, MatB&& Fm)
1564   {
1565     APP_ABORT(" Error: generalizedFockMatrix not implemented for this hamiltonian.\n");
1566   }
1567 
1568   bool distribution_over_cholesky_vectors() const { return true; }
1569   int number_of_ke_vectors() const { return local_nCV; }
1570   int local_number_of_cholesky_vectors() const { return 2 * local_nCV; }
1571   int global_number_of_cholesky_vectors() const { return global_nCV; }
1572   int global_origin_cholesky_vector() const { return global_origin; }
1573 
1574   // transpose=true means G[nwalk][ik], false means G[ik][nwalk]
1575   bool transposed_G_for_vbias() const { return false; }
1576   bool transposed_G_for_E() const { return false; }
1577   // transpose=true means vHS[nwalk][ik], false means vHS[ik][nwalk]
1578   bool transposed_vHS() const { return true; }
1579 
1580   bool fast_ph_energy() const { return false; }
1581 
1582   boost::multi::array<ComplexType, 2> getHSPotentials() { return boost::multi::array<ComplexType, 2>{}; }
1583 
1584 private:
1585   communicator* comm;
1586 
1587   WALKER_TYPES walker_type;
1588 
1589   int global_origin;
1590   int global_nCV;
1591   int local_nCV;
1592 
1593   ValueType E0;
1594 
1595   // bare one body hamiltonian
1596   shmC3Tensor H1;
1597 
1598   // (potentially half rotated) one body hamiltonian
1599   shmCMatrix haj;
1600 
1601   // number of orbitals per k-point
1602   IVector nopk;
1603 
1604   // number of cholesky vectors per Q-point
1605   IVector ncholpQ;
1606 
1607   // position of (-K) in kp-list for every K
1608   IVector kminus;
1609 
1610   // number of electrons per k-point
1611   // nelpk[ndet][nspin*nkpts]
1612   shmIMatrix nelpk;
1613 
1614   // maps (Q,K) --> k2
1615   shmIMatrix QKToK2;
1616 
1617   //Cholesky Tensor Lik[Q][nk][i][k][n]
1618   std::vector<shmSpMatrix> LQKikn;
1619 
1620   // half-tranformed Cholesky tensor
1621   std::vector<shmSpMatrix> LQKank;
1622 
1623   // half-tranformed Cholesky tensor
1624   std::vector<shmSpMatrix> LQKbnl;
1625 
1626   // Defines behavior over Q vector:
1627   //   <0: Ignore (handled by another TG)
1628   //    0: Calculate, without rho^+ contribution
1629   //   >0: Calculate, with rho^+ contribution. LQKbln data located at Qmap[Q]-1
1630   IVector Qmap;
1631   IVector Q2vbias;
1632   int number_of_symmetric_Q;
1633 
1634   // one-body piece of Hamiltonian factorization
1635   shmC3Tensor vn0;
1636 
1637   int nsampleQ;
1638   std::vector<RealType> gQ;
1639   shmIMatrix Qwn;
1640   std::default_random_engine generator;
1641   std::discrete_distribution<int> distribution;
1642 
1643   // shared buffer space
1644   // using matrix since there are issues with vectors
1645   shmSpMatrix SM_TMats;
1646   SpMatrix TMats;
1647 
1648   std::vector<std::unique_ptr<shared_mutex>> mutex;
1649 
1650   //    boost::multi::array<ComplexType,3> Qave;
1651   //    int cntQave=0;
1652   std::vector<ComplexType> EQ;
1653   //    std::default_random_engine generator;
1654   //    std::uniform_real_distribution<RealType> distribution(RealType(0.0),Realtype(1.0));
1655 
1656   myTimer Timer;
1657 
1658   void set_shm_buffer(size_t N)
1659   {
1660     if (SM_TMats.num_elements() < N)
1661       SM_TMats.reextent({N, 1});
1662   }
1663 
1664   template<class MatA, class MatB>
1665   void GKaKjw_to_GKKwaj(int nd, MatA const& GKaKj, MatB&& GKKaj, int nocca_tot, int noccb_tot, int nmo_tot, int akmax)
1666   {
1667     int nspin = (walker_type == COLLINEAR ? 2 : 1);
1668     int npol  = (walker_type == NONCOLLINEAR ? 2 : 1);
1669     int nwalk = GKaKj.size(1);
1670     int nkpts = nopk.size();
1671     assert(GKaKj.num_elements() == (nocca_tot + noccb_tot) * npol * nmo_tot * nwalk);
1672     assert(GKKaj.num_elements() == nspin * nkpts * nkpts * npol * akmax * nwalk);
1673     boost::multi::array_cref<ComplexType, 4> Gca(to_address(GKaKj.origin()), {nocca_tot, npol, nmo_tot, nwalk});
1674     boost::multi::array_cref<ComplexType, 3> Gcb(to_address(GKaKj.origin()) + Gca.num_elements(),
1675                                                  {noccb_tot, nmo_tot, nwalk});
1676     boost::multi::array_ref<SPComplexType, 4> GKK(to_address(GKKaj.origin()),
1677                                                   {nspin, nkpts, nkpts, nwalk * npol * akmax});
1678     int na0 = 0;
1679     for (int Ka = 0, Kaj = 0; Ka < nkpts; Ka++)
1680     {
1681       int na  = nelpk[nd][Ka];
1682       int nj0 = 0;
1683       for (int Kj = 0; Kj < nkpts; Kj++, Kaj++)
1684       {
1685         int nj = nopk[Kj];
1686         if (Kaj % comm->size() != comm->rank())
1687         {
1688           nj0 += nj;
1689           continue;
1690         }
1691         auto G_(to_address(GKK[0][Ka][Kj].origin()));
1692         int naj = na * nj * npol;
1693         for (int a = 0, asj = 0; a < na; a++)
1694         {
1695           for (int p = 0; p < npol; p++)
1696           {
1697             auto Gc_(to_address(Gca[na0 + a][p][nj0].origin()));
1698             for (int j = 0; j < nj; j++, asj++)
1699             {
1700               for (int w = 0, waj = 0; w < nwalk; w++, ++Gc_, waj += naj)
1701 #ifdef MIXED_PRECISION
1702                 G_[waj + asj] = static_cast<SPComplexType>(*Gc_);
1703 #else
1704                 G_[waj + asj] = (*Gc_);
1705 #endif
1706             }
1707           }
1708         }
1709         nj0 += nj;
1710       }
1711       na0 += na;
1712     }
1713     if (nspin > 1)
1714     {
1715       na0 = 0;
1716       for (int Ka = 0, Kaj = 0; Ka < nkpts; Ka++)
1717       {
1718         int na  = nelpk[nd][nkpts + Ka];
1719         int nj0 = 0;
1720         for (int Kj = 0; Kj < nkpts; Kj++, Kaj++)
1721         {
1722           int nj = nopk[Kj];
1723           if (Kaj % comm->size() != comm->rank())
1724           {
1725             nj0 += nj;
1726             continue;
1727           }
1728           auto G_(to_address(GKK[1][Ka][Kj].origin()));
1729           int naj = na * nj;
1730           for (int a = 0, aj = 0; a < na; a++)
1731           {
1732             auto Gc_(to_address(Gcb[na0 + a][nj0].origin()));
1733             for (int j = 0; j < nj; j++, aj++)
1734             {
1735               for (int w = 0, waj = 0; w < nwalk; w++, ++Gc_, waj += naj)
1736 #ifdef MIXED_PRECISION
1737                 G_[waj + aj] = static_cast<SPComplexType>(*Gc_);
1738 #else
1739                 G_[waj + aj] = (*Gc_);
1740 #endif
1741             }
1742           }
1743           nj0 += nj;
1744         }
1745         na0 += na;
1746       }
1747     }
1748     comm->barrier();
1749   }
1750 
1751   // for testing purposes only
1752   template<class MatA, class MatB>
1753   void GwAK_to_GAKw(MatA const& GwAK, MatB&& GAKw)
1754   {
1755     int nwalk = GwAK.size(0);
1756     int nAK   = GwAK.size(1);
1757     for (int w = 0; w < nwalk; w++)
1758       for (int AK = 0; AK < nAK; AK++)
1759         GAKw[AK][w] = GwAK[w][AK];
1760   }
1761 };
1762 
1763 } // namespace afqmc
1764 
1765 } // namespace qmcplusplus
1766 
1767 #endif
1768