////////////////////////////////////////////////////////////////////////
// This file is distributed under the University of Illinois/NCSA Open Source
// License.  See LICENSE file in top directory for details.
//
// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
//
// File developed by:
// Miguel A. Morales, moralessilva2@llnl.gov
//    Lawrence Livermore National Laboratory
//
// File created by:
// Miguel A. Morales, moralessilva2@llnl.gov
//    Lawrence Livermore National Laboratory
////////////////////////////////////////////////////////////////////////////////


#ifndef AFQMC_ROTATE_HPP
#define AFQMC_ROTATE_HPP

#include <numeric>
#include "AFQMC/config.h"
#include "Utilities/FairDivide.h"
#include "AFQMC/Utilities/taskgroup.h"
#include "AFQMC/Matrix/csr_matrix.hpp"
#include "AFQMC/Numerics/ma_operations.hpp"
#include "AFQMC/Numerics/csr_blas.hpp"
#include "AFQMC/Matrix/matrix_emplace_wrapper.hpp"
#include "multi/array_ref.hpp"
#include "AFQMC/Utilities/afqmc_TTI.hpp"
#include "mpi.h"

namespace qmcplusplus
{
namespace afqmc
{
namespace sparse_rotate
{
/*
 *  Performs a (left) half rotation (and a possible transposition) of a Cholesky matrix.
 *  The rotated matrix is stored in "tuple" form in the provided container using emplace_back.
 *  The generation of the matrix is distributed among the containers
 *  of all the cores in the node.
 *  Input:
 *    -alpha: Sparse hermitian conjugate of the rotation matrix for spin up, e.g. transpose(conj(Aup)).
 *    -beta: Sparse hermitian conjugate of the rotation matrix for spin down, e.g. transpose(conj(Adn)).
 *    -A: Input sparse Cholesky matrix (Spvn in the formulas below).
 *    -cutoff: Value below which elements of the rotated matrix are ignored.
 *  Output:
 *    -B: Container (e.g. std::vector) with a segment of the non-zero terms of the rotated matrix.
 *        If transpose==false, the elements in the container are ordered by (a,k) values.
 *
 *  If transpose==true:
 *     B(n,ka-ka0) = sum_i^M alpha(a,i) * Spvn(ik,n)
 *     B(n,ka+N*M-ka0) = sum_i^M beta(a,i) * Spvn(ik,n)
 *  else:
 *     B(ka-ka0,n) = sum_i^M alpha(a,i) * Spvn(ik,n)
 *     B(ka+N*M-ka0,n) = sum_i^M beta(a,i) * Spvn(ik,n),
 *  where N is the number of rows and M the number of columns of alpha and beta.
 *  The number of rows of Spvn should be equal to M*M.
 *  If conjV==true, Spvn(ik,n) -> conj(Spvn(ki,n)) in the expressions above.
 */
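// Illustration of the row/column layout (a sketch with made-up sizes, following the loop
// order in the function below): for type==CLOSED, NMO=4, NAEA=2, k0=0, kN=4 and
// transpose==false, Qdim = NAEA*NMO = 8, and the element for (k=1, a=0) is emplaced with
// row index cnt = (k - k0)*NAEA + a = 2, i.e. rows run over k (slow index) and a (fast
// index). In the COLLINEAR case the beta block starts at row NAEA*(kN_alpha - k0_alpha).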
template<class Container, class task_group, class PsiT_Type>
void halfRotateCholeskyMatrix(WALKER_TYPES type,
                              task_group& TG,
                              int k0,
                              int kN,
                              Container& Q,
                              PsiT_Type* Alpha,
                              PsiT_Type* Beta,
                              SpVType_shm_csr_matrix const& CholMat,
                              bool transpose,
                              bool conjV           = false,
                              double cutoff        = 1e-6,
                              bool reserve_to_fit_ = true)
{
  int NAEA = Alpha->size(0);
  int NAEB = Alpha->size(0);
  int NMO  = Alpha->size(1);
  if (type == COLLINEAR)
    NAEB = Beta->size(0);
  int nvec   = CholMat.size(1);
  int ncores = TG.getTotalCores(), coreid = TG.getCoreID();

  assert(CholMat.size(0) == NMO * NMO);
  assert(kN > k0);
  if (type == CLOSED && kN > NMO)
    APP_ABORT(" Error: kN > NMO in halfRotateCholeskyMatrix. \n");

  // map from [0:2*NMO) to [0:NMO) in collinear case
  int k0_alpha = 0, k0_beta = 0, kN_alpha = 0, kN_beta = 0;
  if (k0 >= 0)
  {
    k0_alpha = std::min(k0, NMO);
    kN_alpha = std::min(kN, NMO);
    if (type == COLLINEAR)
    {
      kN_beta = std::max(kN, NMO) - NMO;
      k0_beta = std::max(k0, NMO) - NMO;
    }
  }

  int ak0, ak1;
  int Qdim = NAEA * (kN_alpha - k0_alpha) + NAEB * (kN_beta - k0_beta);
  if (transpose)
  {
    //if(not check_shape(Q,{nvec,Qdim}))
    if (not(Q.size(0) == nvec && Q.size(1) == Qdim))
      APP_ABORT(" Error: Container Q has incorrect dimensions in halfRotateCholeskyMatrix. \n");
  }
  else
  {
    //if(not check_shape(Q,{Qdim,nvec}))
    if (not(Q.size(0) == Qdim && Q.size(1) == nvec))
      APP_ABORT(" Error: Container Q has incorrect dimensions in halfRotateCholeskyMatrix. \n");
  }
  std::tie(ak0, ak1) = FairDivideBoundary(coreid, Qdim, ncores);

  if (type == NONCOLLINEAR)
    APP_ABORT(" GHF not yet implemented. \n");

  boost::multi::array<SPComplexType, 1> vec(iextensions<1u>{nvec});
  if (reserve_to_fit_)
  {
    std::vector<std::size_t> sz_per_row(Qdim);
    int cnt = 0;
    for (int k = k0_alpha; k < kN_alpha; k++)
    {
      for (int a = 0; a < NAEA; a++, cnt++)
      {
        if (cnt < ak0)
          continue;
        if (cnt >= ak1)
          break;
        std::fill_n(vec.origin(), vec.size(), SPComplexType(0, 0));
        auto Aa = (*Alpha)[a];
        for (int ip = 0; ip < Aa.num_non_zero_elements(); ++ip)
        {
          auto Aai = Aa.non_zero_values_data()[ip];
          auto i   = Aa.non_zero_indices2_data()[ip];
          if (conjV)
            csr::axpy('C', Aai, CholMat[k * NMO + i], vec);
          else
            csr::axpy('N', Aai, CholMat[i * NMO + k], vec);
        }
        if (transpose)
        {
          for (int n = 0; n < nvec; n++)
            if (std::abs(vec[n]) > cutoff)
              ++sz_per_row[n];
        }
        else
        {
          for (int n = 0; n < nvec; n++)
            if (std::abs(vec[n]) > cutoff)
              ++sz_per_row[cnt];
        }
      }
    }
    // reset "amount of work done" to full alpha piece
    cnt = NAEA * (kN_alpha - k0_alpha);
    if (type == COLLINEAR)
    {
      // reset "shift"
      for (int k = k0_beta; k < kN_beta; k++)
      {
        for (int a = 0; a < NAEB; a++, cnt++)
        {
          if (cnt < ak0)
            continue;
          if (cnt >= ak1)
            break;
          std::fill_n(vec.origin(), vec.size(), SPComplexType(0, 0));
          auto Aa = (*Beta)[a];
          for (int ip = 0; ip < Aa.num_non_zero_elements(); ++ip)
          {
            auto Aai = Aa.non_zero_values_data()[ip];
            auto i   = Aa.non_zero_indices2_data()[ip];
            if (conjV)
              csr::axpy('C', Aai, CholMat[k * NMO + i], vec);
            else
              csr::axpy('N', Aai, CholMat[i * NMO + k], vec);
          }
          if (transpose)
          {
            for (int n = 0; n < nvec; n++)
              if (std::abs(vec[n]) > cutoff)
                ++sz_per_row[n];
          }
          else
          {
            for (int n = 0; n < nvec; n++)
              if (std::abs(vec[n]) > cutoff)
                ++sz_per_row[cnt];
          }
        }
      }
    }
    TG.Node().all_reduce_in_place_n(sz_per_row.begin(), sz_per_row.size(), std::plus<>());
    reserve_to_fit(Q, sz_per_row);
  } // else
  //  Q.reserve( std::size_t(0.1 * nvec * NEL * NMO / ncores ) ); // by default, assume 10% sparsity

  int cnt = 0;
  for (int k = k0_alpha; k < kN_alpha; k++)
  {
    for (int a = 0; a < NAEA; a++, cnt++)
    {
      if (cnt < ak0)
        continue;
      if (cnt >= ak1)
        break;
      std::fill_n(vec.origin(), vec.size(), SPComplexType(0, 0));
      auto Aa = (*Alpha)[a];
      for (int ip = 0; ip < Aa.num_non_zero_elements(); ++ip)
      {
        auto Aai = Aa.non_zero_values_data()[ip];
        auto i   = Aa.non_zero_indices2_data()[ip];
        if (conjV)
          csr::axpy('C', Aai, CholMat[k * NMO + i], vec);
        else
          csr::axpy('N', Aai, CholMat[i * NMO + k], vec);
      }
      if (transpose)
      {
        for (int n = 0; n < nvec; n++)
          if (std::abs(vec[n]) > cutoff)
            emplace(Q, std::forward_as_tuple(n, cnt, vec[n]));
      }
      else
      {
        for (int n = 0; n < nvec; n++)
          if (std::abs(vec[n]) > cutoff)
            emplace(Q, std::forward_as_tuple(cnt, n, vec[n]));
      }
    }
  }
  // reset "amount of work done" to full alpha piece
  cnt = NAEA * (kN_alpha - k0_alpha);
  if (type == COLLINEAR)
  {
    // reset "shift"
    for (int k = k0_beta; k < kN_beta; k++)
    {
      for (int a = 0; a < NAEB; a++, cnt++)
      {
        if (cnt < ak0)
          continue;
        if (cnt >= ak1)
          break;
        std::fill_n(vec.origin(), vec.size(), SPComplexType(0, 0));
        auto Aa = (*Beta)[a];
        for (int ip = 0; ip < Aa.num_non_zero_elements(); ++ip)
        {
          auto Aai = Aa.non_zero_values_data()[ip];
          auto i   = Aa.non_zero_indices2_data()[ip];
          if (conjV)
            csr::axpy('C', Aai, CholMat[k * NMO + i], vec);
          else
            csr::axpy('N', Aai, CholMat[i * NMO + k], vec);
        }
        if (transpose)
        {
          for (int n = 0; n < nvec; n++)
            if (std::abs(vec[n]) > cutoff)
              emplace(Q, std::forward_as_tuple(n, cnt, vec[n]));
        }
        else
        {
          for (int n = 0; n < nvec; n++)
            if (std::abs(vec[n]) > cutoff)
              emplace(Q, std::forward_as_tuple(cnt, n, vec[n]));
        }
      }
    }
  }
}

/*
 * Calculates the rotated Cholesky matrix used in the calculation of the bias potential.
 *     v(n,ak) = sum_i^M A(a,i) * Spvn(ik,n),        with A(a,i) = PsiT(i,a)*
 *     v(n,N*M+ak) = sum_i^M beta(a,i) * Spvn(ik,n),
 *  where N is the number of rows and M the number of columns of alpha and beta.
 *  The number of rows of Spvn should be equal to M*M.
 *  Since we only rotate the "local" Cholesky matrix, the algorithm is only parallelized
 *  over a node. In principle, it is possible to spread this over equivalent nodes.
 */
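// Usage sketch (illustrative only; TG, the trial-wavefunction components PsiT[0]/PsiT[1]
// and CholMat are placeholders assumed to be built elsewhere in the usual AFQMC setup):
//
//   auto SpvnT = sparse_rotate::halfRotateCholeskyMatrixForBias<SPComplexType>(
//                    COLLINEAR, TG, &PsiT[0], &PsiT[1], CholMat, 1e-6);
//   // SpvnT is a SpCType_shm_csr_matrix of shape {nchol, NAEA*NMO + NAEB*NMO}.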
template<typename T,
         class task_group,
         class PsiT_Type,
         typename = typename std::enable_if_t<std::is_same<T, std::complex<double>>::value or
                                              std::is_same<T, std::complex<float>>::value>>
SpCType_shm_csr_matrix halfRotateCholeskyMatrixForBias(WALKER_TYPES type,
                                                       task_group& TG,
                                                       PsiT_Type* Alpha,
                                                       PsiT_Type* Beta,
                                                       SpVType_shm_csr_matrix const& CholMat,
                                                       double cutoff = 1e-6)
{
  int NAEA = Alpha->size(0);
  int NAEB = Alpha->size(0);
  int NMO  = Alpha->size(1);
  if (type != CLOSED)
    NAEB = Beta->size(0);
  int nvec   = CholMat.size(1);
  int ncores = TG.getTotalCores(), coreid = TG.getCoreID();

  // to speed up, generate new communicator for eqv_nodes and split full work among all
  // cores in this comm. Then build from distributed container?

  assert(CholMat.size(0) == NMO * NMO);

  std::size_t Qdim = NAEA * NMO;
  if (type == COLLINEAR)
    Qdim += NAEB * NMO;
  if (type == NONCOLLINEAR)
    Qdim = 2 * NMO * (NAEA + NAEB);
  std::size_t ak0, ak1;
  std::tie(ak0, ak1) = FairDivideBoundary(std::size_t(coreid), Qdim, std::size_t(ncores));

  if (type == NONCOLLINEAR)
    APP_ABORT(" GHF not yet implemented. \n");

  boost::multi::array<SPComplexType, 1> vec(iextensions<1u>{nvec});
  std::vector<std::size_t> sz_per_row(nvec);
  std::size_t cnt = 0;
  for (int a = 0; a < NAEA; a++)
  {
    for (int k = 0; k < NMO; k++, cnt++)
    {
      if (cnt < ak0)
        continue;
      if (cnt >= ak1)
        break;
      std::fill_n(vec.origin(), vec.size(), SPComplexType(0, 0));
      auto Aa = (*Alpha)[a];
      for (int ip = 0; ip < Aa.num_non_zero_elements(); ++ip)
      {
        auto Aai = Aa.non_zero_values_data()[ip];
        auto i   = Aa.non_zero_indices2_data()[ip];
        csr::axpy('N', Aai, CholMat[i * NMO + k], vec);
      }
      for (std::size_t n = 0; n < nvec; n++)
        if (std::abs(vec[n]) > cutoff)
          ++(sz_per_row[n]);
    }
  }

  // reset "amount of work done" to full alpha piece
  cnt = NAEA * NMO;
  if (type == COLLINEAR)
  {
    // reset "shift"
    for (int a = 0; a < NAEB; a++)
    {
      for (int k = 0; k < NMO; k++, cnt++)
      {
        if (cnt < ak0)
          continue;
        if (cnt >= ak1)
          break;
        std::fill_n(vec.origin(), vec.size(), SPComplexType(0, 0));
        auto Aa = (*Beta)[a];
        for (int ip = 0; ip < Aa.num_non_zero_elements(); ++ip)
        {
          auto Aai = Aa.non_zero_values_data()[ip];
          auto i   = Aa.non_zero_indices2_data()[ip];
          csr::axpy('N', Aai, CholMat[i * NMO + k], vec);
        }
        for (std::size_t n = 0; n < nvec; n++)
          if (std::abs(vec[n]) > cutoff)
            ++sz_per_row[n];
      }
    }
  }
  TG.Node().all_reduce_in_place_n(sz_per_row.begin(), sz_per_row.size(), std::plus<>());

  using Alloc = shared_allocator<SPComplexType>;
  SpCType_shm_csr_matrix::base ucsr(tp_ul_ul{nvec, Qdim}, tp_ul_ul{0, 0}, sz_per_row, Alloc(TG.Node()));

  using mat_wrapper = csr::matrix_emplace_wrapper<SpCType_shm_csr_matrix::base>;
  mat_wrapper ucsr_wrapper(ucsr, TG.Node());

  cnt = 0;
  for (int a = 0; a < NAEA; a++)
  {
    for (int k = 0; k < NMO; k++, cnt++)
    {
      if (cnt < ak0)
        continue;
      if (cnt >= ak1)
        break;
      std::fill_n(vec.origin(), vec.size(), SPComplexType(0, 0));
      auto Aa = (*Alpha)[a];
      for (int ip = 0; ip < Aa.num_non_zero_elements(); ++ip)
      {
        auto Aai = Aa.non_zero_values_data()[ip];
        auto i   = Aa.non_zero_indices2_data()[ip];
        csr::axpy('N', Aai, CholMat[i * NMO + k], vec);
      }
      for (std::size_t n = 0; n < nvec; n++)
        if (std::abs(vec[n]) > cutoff)
          ucsr_wrapper.emplace(std::forward_as_tuple(n, cnt, vec[n]));
    }
  }
  // reset "amount of work done" to full alpha piece
  cnt = NAEA * NMO;
  if (type == COLLINEAR)
  {
    // reset "shift"
    for (int a = 0; a < NAEB; a++)
    {
      for (int k = 0; k < NMO; k++, cnt++)
      {
        if (cnt < ak0)
          continue;
        if (cnt >= ak1)
          break;
        std::fill_n(vec.origin(), vec.size(), SPComplexType(0, 0));
        auto Aa = (*Beta)[a];
        for (int ip = 0; ip < Aa.num_non_zero_elements(); ++ip)
        {
          auto Aai = Aa.non_zero_values_data()[ip];
          auto i   = Aa.non_zero_indices2_data()[ip];
          csr::axpy('N', Aai, CholMat[i * NMO + k], vec);
        }
        for (std::size_t n = 0; n < nvec; n++)
          if (std::abs(vec[n]) > cutoff)
            ucsr_wrapper.emplace(std::forward_as_tuple(n, cnt, vec[n]));
      }
    }
  }
  ucsr_wrapper.push_buffer();
  return SpCType_shm_csr_matrix(std::move(ucsr));
}

// This is needed to allow generic implementation of SparseTensorIO while catching errors at compile time.
template<typename T,
         class task_group,
         class PsiT_Type,
         typename = typename std::enable_if_t<not(std::is_same<T, std::complex<double>>::value or
                                                  std::is_same<T, std::complex<float>>::value)>,
         typename = void>
SpVType_shm_csr_matrix halfRotateCholeskyMatrixForBias(WALKER_TYPES type,
                                                       task_group& TG,
                                                       PsiT_Type* Alpha,
                                                       PsiT_Type* Beta,
                                                       SpVType_shm_csr_matrix const& CholMat,
                                                       double cutoff = 1e-6)
{
  print_stacktrace throw std::runtime_error(
      "Error: Incorrect template parameter in halfRotateCholeskyMatrixForBias. \n");
  using Alloc = shared_allocator<SPRealType>;
  SpVType_shm_csr_matrix csr(tp_ul_ul{1, 1}, tp_ul_ul{0, 0}, 1, Alloc(TG.Node()));
  return csr;
}

} // namespace sparse_rotate

namespace ma_rotate
{
template<class MultiArray2D, class task_group, class PsiT_Type>
void halfRotateCholeskyMatrix(WALKER_TYPES type,
                              task_group& TG,
                              int k0,
                              int kN,
                              MultiArray2D&& Q,
                              PsiT_Type* Alpha,
                              PsiT_Type* Beta,
                              SpVType_shm_csr_matrix const& CholMat,
                              bool transpose,
                              bool conjV    = false,
                              double cutoff = 1e-6)
{
  int NAEA = Alpha->size(0);
  int NAEB = 0;
  int NMO  = Alpha->size(1);
  if (type == COLLINEAR)
    NAEB = Beta->size(0);
  int nvec   = CholMat.size(1);
  int ncores = TG.getTotalCores(), coreid = TG.getCoreID();

  assert(CholMat.size(0) == NMO * NMO);
  if (type == CLOSED && kN > NMO)
    APP_ABORT(" Error: kN > NMO in halfRotateCholeskyMatrix. \n");

  // map from [0:2*NMO) to [0:NMO) in collinear case
  int k0_alpha = 0, k0_beta = 0, kN_alpha = 0, kN_beta = 0;
  if (k0 >= 0)
  {
    k0_alpha = std::min(k0, NMO);
    kN_alpha = std::min(kN, NMO);
    if (type == COLLINEAR)
    {
      k0_beta = std::max(k0, NMO) - NMO;
      kN_beta = std::max(kN, NMO) - NMO;
    }
  }

  int ak0, ak1;
  int Qdim = NAEA * (kN_alpha - k0_alpha) + NAEB * (kN_beta - k0_beta);
  if (transpose)
  {
    assert(Q.size(0) == nvec);
    assert(Q.size(1) == Qdim);
  }
  else
  {
    assert(Q.size(0) == Qdim);
    assert(Q.size(1) == nvec);
  }
  std::tie(ak0, ak1) = FairDivideBoundary(coreid, Qdim, ncores);

  if (type == NONCOLLINEAR)
    APP_ABORT(" GHF not yet implemented. \n");

  int cnt = 0;
  for (int k = k0_alpha; k < kN_alpha; k++)
  {
    for (int a = 0; a < NAEA; a++, cnt++)
    {
      if (cnt < ak0)
        continue;
      if (cnt >= ak1)
        break;
      if (transpose)
      {
        auto vec = Q(Q.extension(0), cnt);
        for (auto& v : vec)
          v = SPComplexType(0, 0);
        auto Aa = (*Alpha)[a];
        for (int ip = 0; ip < Aa.num_non_zero_elements(); ++ip)
        {
          auto Aai = Aa.non_zero_values_data()[ip];
          auto i   = Aa.non_zero_indices2_data()[ip];
          if (conjV)
            csr::axpy('C', Aai, CholMat[k * NMO + i], vec);
          else
            csr::axpy('N', Aai, CholMat[i * NMO + k], vec);
        }
      }
      else
      {
        auto vec = Q[cnt];
        for (auto& v : vec)
          v = SPComplexType(0, 0);
        auto Aa = (*Alpha)[a];
        for (int ip = 0; ip < Aa.num_non_zero_elements(); ++ip)
        {
          auto Aai = Aa.non_zero_values_data()[ip];
          auto i   = Aa.non_zero_indices2_data()[ip];
          if (conjV)
            csr::axpy('C', Aai, CholMat[k * NMO + i], vec);
          else
            csr::axpy('N', Aai, CholMat[i * NMO + k], vec);
        }
      }
    }
  }
  // reset "amount of work done" to full alpha piece
  cnt = NAEA * (kN_alpha - k0_alpha);
  if (type == COLLINEAR)
  {
    // reset "shift"
    for (int k = k0_beta; k < kN_beta; k++)
    {
      for (int a = 0; a < NAEB; a++, cnt++)
      {
        if (cnt < ak0)
          continue;
        if (cnt >= ak1)
          break;
        if (transpose)
        {
          auto vec = Q(Q.extension(0), cnt);
          for (auto& v : vec)
            v = SPComplexType(0, 0);
          auto Aa = (*Beta)[a];
          for (int ip = 0; ip < Aa.num_non_zero_elements(); ++ip)
          {
            auto Aai = Aa.non_zero_values_data()[ip];
            auto i   = Aa.non_zero_indices2_data()[ip];
            if (conjV)
              csr::axpy('C', Aai, CholMat[k * NMO + i], vec);
            else
              csr::axpy('N', Aai, CholMat[i * NMO + k], vec);
          }
        }
        else
        {
          auto vec = Q[cnt];
          for (auto& v : vec)
            v = SPComplexType(0, 0);
          auto Aa = (*Beta)[a];
          for (int ip = 0; ip < Aa.num_non_zero_elements(); ++ip)
          {
            auto Aai = Aa.non_zero_values_data()[ip];
            auto i   = Aa.non_zero_indices2_data()[ip];
            if (conjV)
              csr::axpy('C', Aai, CholMat[k * NMO + i], vec);
            else
              csr::axpy('N', Aai, CholMat[i * NMO + k], vec);
          }
        }
      }
    }
  }
  TG.Node().barrier();
}

// designed for compact arrays
/*
 * Constructs the following contraction of the Slater matrix of the trial wfn and the Cholesky matrix (Likn):
 * Case:
 *   - Closed/Collinear:  L[a][n][k] = sum_i A[a][i] L[i][k][n]
 *       - In the collinear case, two separate calls are made, one for each spin channel.
 *   - Non-collinear: L[a][n][sk] = sum_i A[a][si] L[i][k][n]   // [si] == [s][i] combined spinor index
 *       - In this case, to preserve matrix dimensions, [s][k] --> [sk] is kept as a single index.
 */
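// Usage sketch (illustrative only; Aai and Likn are placeholders with shapes [na][ni] and
// [ni][nk][nchol], and the extents below mirror the asserts inside getLank):
//
//   boost::multi::array<SPComplexType, 3> Lank({na, nchol, nk});
//   boost::multi::array<SPComplexType, 2> buff({nk, nchol});
//   ma_rotate::getLank(Aai, Likn, Lank, buff, /*noncollinear=*/false);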
template<class MultiArray2DA, class MultiArray3DB, class MultiArray3DC, class MultiArray2D>
void getLank(MultiArray2DA&& Aai,
             MultiArray3DB&& Likn,
             MultiArray3DC&& Lank,
             MultiArray2D&& buff,
             bool noncollinear = false)
{
  int npol = noncollinear ? 2 : 1;
  int na   = Aai.size(0);
  if (na == 0)
    return;
  int ni    = Aai.size(1) / npol;
  int nk    = Likn.size(1);
  int nchol = Likn.size(2);
  assert(Likn.size(0) == ni);
  assert(Lank.size(0) == na);
  assert(Lank.size(1) == nchol);
  assert(Lank.size(2) == nk * npol);
  assert(buff.size(0) >= npol * nk);
  assert(buff.size(1) >= nchol);
  if (noncollinear)
    assert(Aai.stride(0) == Aai.size(1)); // make sure it is contiguous

  using elementA = typename std::decay<MultiArray2DA>::type::element;
  using element  = typename std::decay<MultiArray3DC>::type::element;
  boost::multi::array_ref<elementA, 2> Aas_i(to_address(Aai.origin()), {na * npol, ni});
  boost::multi::array_ref<element, 2> Li_kn(to_address(Likn.origin()), {ni, nk * nchol});
  boost::multi::array_ref<element, 2> Las_kn(to_address(Lank.origin()), {na * npol, nk * nchol});

  ma::product(Aas_i, Li_kn, Las_kn);
  for (int a = 0; a < na; a++)
  {
    boost::multi::array_ref<element, 2> Lskn(to_address(Lank[a].origin()), {npol * nk, nchol});
    boost::multi::array_ref<element, 2> Lnsk(to_address(Lank[a].origin()), {nchol, npol * nk});
    buff({0, npol * nk}, {0, nchol}) = Lskn;
    ma::transpose(buff({0, npol * nk}, {0, nchol}), Lnsk);
  }
}

/*
 * Constructs the following contraction of the Slater matrix of the trial wfn and the Cholesky matrix (Lkin):
 * Case:
 *   - Closed/Collinear:  L[a][n][k] = sum_i A[a][i] conj(L[k][i][n])
 *       - In the collinear case, two separate calls are made, one for each spin channel.
 *   - Non-collinear: L[a][n][sk] = sum_i A[a][si] conj(L[k][i][n])   // [si] == [s][i] combined spinor index
 *       - In this case, to preserve matrix dimensions, [s][k] --> [sk] is kept as a single index.
 */
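// Usage sketch (illustrative only; note the scratch buffer is laid out as [nchol][na] here,
// matching the bnas array_ref inside the function):
//
//   boost::multi::array<SPComplexType, 2> buff({nchol, na});
//   ma_rotate::getLank_from_Lkin(Aai, Lkin, Lank, buff, /*noncollinear=*/false);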
template<class MultiArray2DA, class MultiArray3DB, class MultiArray3DC, class MultiArray2D>
void getLank_from_Lkin(MultiArray2DA&& Aai,
                       MultiArray3DB&& Lkin,
                       MultiArray3DC&& Lank,
                       MultiArray2D&& buff,
                       bool noncollinear = false)
{
  int npol = noncollinear ? 2 : 1;
  int na   = Aai.size(0);
  if (na == 0)
    return;
  int ni    = Aai.size(1) / npol;
  int nk    = Lkin.size(0);
  int nchol = Lkin.size(2);
  assert(Lkin.size(1) == ni);
  assert(Lank.size(0) == na);
  assert(Lank.size(1) == nchol);
  assert(Lank.size(2) == nk * npol);
  assert(buff.num_elements() >= na * npol * nchol);
  if (noncollinear)
    assert(Aai.stride(0) == Aai.size(1)); // make sure it is contiguous

  using Type     = typename std::decay<MultiArray3DC>::type::element;
  using elementA = typename std::decay<MultiArray2DA>::type::element;
  boost::multi::array_ref<elementA, 2> Aas_i(to_address(Aai.origin()), {na * npol, ni});
  boost::multi::array_ref<Type, 2> bnas(to_address(buff.origin()), {nchol, na * npol});
  // Lank[a][n][k] = sum_i Aai[a][i] conj(Lkin[k][i][n])
  // Lank[as][n][k] = sum_i Aai[as][i] conj(Lkin[k][i][n])
  for (int k = 0; k < nk; k++)
  {
    ma::product(ma::H(Lkin[k]), ma::T(Aas_i), bnas);
    for (int a = 0; a < na; a++)
      for (int n = 0; n < nchol; n++)
        for (int p = 0; p < npol; p++)
          Lank[a][n][p * nk + k] = bnas[n][a * npol + p];
  }
}

} // namespace ma_rotate

namespace ma_rotate_padded
{
// designed for padded arrays
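/*
 * Same contractions as ma_rotate::getLank / getLank_from_Lkin, but written for arrays that are
 * padded along the first dimension (Lakn.size(0) and Lank.size(0) may exceed na). Both index
 * orderings are produced:
 *   Lakn[a][sk][n] = sum_i A[a][si] L[i][k][n]   (conj(L[k][i][n]) in the _from_Lkin variant)
 *   Lank[a][n][sk] = transpose of Lakn[a]
 */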
template<class MultiArray2DA, class MultiArray3DB, class MultiArray3DC>
void getLakn_Lank(MultiArray2DA&& Aai,
                  MultiArray3DB&& Likn,
                  MultiArray3DC&& Lakn,
                  MultiArray3DC&& Lank,
                  bool noncollinear = false)
{
  int npol = noncollinear ? 2 : 1;
  int na   = Aai.size(0);
  if (na == 0)
    return;
  int ni = Aai.size(1) / npol;

  int nmo   = Likn.size(0);
  int nchol = Likn.size(2);
  assert(Likn.size(1) == nmo);

  assert(Lakn.size(1) == npol * nmo);
  assert(Lakn.size(2) == nchol);

  assert(Lakn.size(0) >= na);
  assert(Lakn.size(0) == Lank.size(0));
  assert(Lank.size(1) == nchol);
  assert(Lank.size(2) == npol * nmo);

  if (noncollinear)
    assert(Aai.stride(0) == Aai.size(1)); // make sure it is contiguous

  using elmA = typename std::decay<MultiArray2DA>::type::element;
  using elmB = typename std::decay<MultiArray3DB>::type::element;
  using elmC = typename std::decay<MultiArray3DC>::type::element;

  boost::multi::array_ref<elmA, 2> Aas_i(to_address(Aai.origin()), {na * npol, ni});
  boost::multi::array_ref<elmB, 2, decltype(Likn.origin())> Li_kn(Likn.origin(), {ni, nmo * nchol});
  boost::multi::array_ref<elmC, 2, decltype(Lakn.origin())> Las_kn(Lakn.origin(), {na * npol, nmo * nchol});

  ma::product(Aas_i, Li_kn, Las_kn);
  for (int a = 0; a < na; a++)
    ma::transpose(Lakn[a], Lank[a]);
}

template<class MultiArray2DA, class MultiArray3DB, class MultiArray3DC, class MultiArray2D>
void getLakn_Lank_from_Lkin(MultiArray2DA&& Aai,
                            MultiArray3DB&& Lkin,
                            MultiArray3DC&& Lakn,
                            MultiArray3DC&& Lank,
                            MultiArray2D&& buff,
                            bool noncollinear = false)
{
  int npol = noncollinear ? 2 : 1;
  int na   = Aai.size(0);
  if (na == 0)
    return;
  int ni = Aai.size(1) / npol;

  int nmo   = Lkin.size(0);
  int nchol = Lkin.size(2);
  assert(Lkin.size(1) == nmo);

  assert(Lakn.size(1) == npol * nmo);
  assert(Lakn.size(2) == nchol);

  assert(Lakn.size(0) >= na);
  assert(Lakn.size(0) == Lank.size(0));
  assert(Lank.size(1) == nchol);
  assert(Lank.size(2) == npol * nmo);

  if (noncollinear)
    assert(Aai.stride(0) == Aai.size(1)); // make sure it is contiguous

  assert(buff.num_elements() >= na * npol * nchol);

  using ptr2 = typename std::decay<MultiArray2D>::type::element_ptr;
  using elm2 = typename std::decay<MultiArray2D>::type::element;
  using elmA = typename std::decay<MultiArray2DA>::type::element;

  boost::multi::array_ref<elmA, 2> Aas_i(to_address(Aai.origin()), {na * npol, ni});
  boost::multi::array_ref<elm2, 2, ptr2> bnas(buff.origin(), {nchol, na * npol});
  // Lakn[a][sk][n] = sum_i Aai[as][i] conj(Lkin[k][i][n])
  for (int k = 0; k < nmo; k++)
  {
    ma::product(ma::H(Lkin[k].sliced(0, ni)), ma::T(Aas_i), bnas);
    for (int a = 0; a < na; a++)
      for (int p = 0; p < npol; p++)
        Lakn[a][p * nmo + k] = bnas({0, nchol}, a * npol + p);
  }
  for (int a = 0; a < na; a++)
    ma::transpose(Lakn[a], Lank[a]);
}


} // namespace ma_rotate_padded


} // namespace afqmc

} // namespace qmcplusplus

#endif