//////////////////////////////////////////////////////////////////////////////
// This file is distributed under the University of Illinois/NCSA Open Source
// License. See LICENSE file in top directory for details.
//
// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
//
// File developed by:
// Miguel A. Morales, moralessilva2@llnl.gov
// Lawrence Livermore National Laboratory
//
// File created by:
// Miguel A. Morales, moralessilva2@llnl.gov
// Lawrence Livermore National Laboratory
//////////////////////////////////////////////////////////////////////////////


#ifndef AFQMC_ROTATE_HPP
#define AFQMC_ROTATE_HPP

#include <numeric>
#include "AFQMC/config.h"
#include "Utilities/FairDivide.h"
#include "AFQMC/Utilities/taskgroup.h"
#include "AFQMC/Matrix/csr_matrix.hpp"
#include "AFQMC/Numerics/ma_operations.hpp"
#include "AFQMC/Numerics/csr_blas.hpp"
#include "AFQMC/Matrix/matrix_emplace_wrapper.hpp"
#include "multi/array_ref.hpp"
#include "AFQMC/Utilities/afqmc_TTI.hpp"
#include "mpi.h"

namespace qmcplusplus
{
namespace afqmc
{
namespace sparse_rotate
{
/*
 * Performs a (left) half rotation (and a possible transposition) of a Cholesky matrix.
 * The rotated matrix is stored in "tuple" form in the provided container using emplace.
 * The generation of the matrix is distributed among the containers
 * of all the cores in the node.
 * Input:
 *   -alpha: Sparse hermitian conjugate of the rotation matrix for spin up, e.g. transpose(conj(Aup)).
 *   -beta: Sparse hermitian conjugate of the rotation matrix for spin down, e.g. transpose(conj(Adn)).
 *   -A: Input sparse Cholesky matrix.
 *   -cutoff: Value below which elements of the rotated matrix are ignored.
 * Output:
 *   -B: Container (e.g. std::vector) with a segment of the non-zero terms of the rotated matrix.
 *       If transpose==false, the elements in the container are ordered by (a,k) values.
 *
 * If transpose==true:
 *   B(n,ka-ka0) = sum_i^M alpha(a,i) * Spvn(ik,n)
 *   B(n,ka+N*M-ka0) = sum_i^M beta(a,i) * Spvn(ik,n)
 * else:
 *   B(ka-ka0,n) = sum_i^M alpha(a,i) * Spvn(ik,n)
 *   B(ka+N*M-ka0,n) = sum_i^M beta(a,i) * Spvn(ik,n),
 * where M/N is the number of columns/rows of alpha and beta.
 * The number of rows of Spvn should be equal to M*M.
 * If conjV==true, Spvn(ik,n) -> conj(Spvn(ki,n)) in the expressions above.
 */
template<class Container, class task_group, class PsiT_Type>
void halfRotateCholeskyMatrix(WALKER_TYPES type,
                              task_group& TG,
                              int k0,
                              int kN,
                              Container& Q,
                              PsiT_Type* Alpha,
                              PsiT_Type* Beta,
                              SpVType_shm_csr_matrix const& CholMat,
                              bool transpose,
                              bool conjV = false,
                              double cutoff = 1e-6,
                              bool reserve_to_fit_ = true)
{
  int NAEA = Alpha->size(0);
  int NAEB = Alpha->size(0); // CLOSED: beta sector has the same size as alpha
  int NMO = Alpha->size(1);
  if (type == COLLINEAR)
    NAEB = Beta->size(0);
  int nvec = CholMat.size(1);
  int ncores = TG.getTotalCores(), coreid = TG.getCoreID();

  assert(CholMat.size(0) == NMO * NMO);
  assert(kN > k0);
  if (type == CLOSED && kN > NMO)
    APP_ABORT(" Error: kN > NMO in halfRotateCholeskyMatrix. \n");
  // map the row range [k0,kN) in [0:2*NMO) onto alpha/beta sectors in [0:NMO)
  // in the collinear case; e.g. with NMO=10, k0=5, kN=15:
  // alpha covers [5,10) and beta covers [0,5)
  int k0_alpha = 0, k0_beta = 0, kN_alpha = 0, kN_beta = 0;
  if (k0 >= 0)
  {
    k0_alpha = std::min(k0, NMO);
    kN_alpha = std::min(kN, NMO);
    if (type == COLLINEAR)
    {
      kN_beta = std::max(kN, NMO) - NMO;
      k0_beta = std::max(k0, NMO) - NMO;
    }
  }

  int ak0, ak1;
  int Qdim = NAEA * (kN_alpha - k0_alpha) + NAEB * (kN_beta - k0_beta);
  if (transpose)
  {
    //if(not check_shape(Q,{nvec,Qdim}))
    if (not(Q.size(0) == nvec && Q.size(1) == Qdim))
      APP_ABORT(" Error: Container Q has incorrect dimensions in halfRotateCholeskyMatrix. \n");
  }
  else
  {
    //if(not check_shape(Q,{Qdim,nvec}))
    if (not(Q.size(0) == Qdim && Q.size(1) == nvec))
      APP_ABORT(" Error: Container Q has incorrect dimensions in halfRotateCholeskyMatrix. \n");
  }
  std::tie(ak0, ak1) = FairDivideBoundary(coreid, Qdim, ncores);

  if (type == NONCOLLINEAR)
    APP_ABORT(" GHF not yet implemented. \n");

  boost::multi::array<SPComplexType, 1> vec(iextensions<1u>{nvec});
  if (reserve_to_fit_)
  {
    // one counter per row of Q: nvec rows when transpose==true, Qdim otherwise
    std::vector<std::size_t> sz_per_row(transpose ? nvec : Qdim);
    int cnt = 0;
    for (int k = k0_alpha; k < kN_alpha; k++)
    {
      for (int a = 0; a < NAEA; a++, cnt++)
      {
        if (cnt < ak0)
          continue;
        if (cnt >= ak1)
          break;
        std::fill_n(vec.origin(), vec.size(), SPComplexType(0, 0));
        auto Aa = (*Alpha)[a];
        for (int ip = 0; ip < Aa.num_non_zero_elements(); ++ip)
        {
          auto Aai = Aa.non_zero_values_data()[ip];
          auto i = Aa.non_zero_indices2_data()[ip];
          if (conjV)
            csr::axpy('C', Aai, CholMat[k * NMO + i], vec);
          else
            csr::axpy('N', Aai, CholMat[i * NMO + k], vec);
        }
        if (transpose)
        {
          for (int n = 0; n < nvec; n++)
            if (std::abs(vec[n]) > cutoff)
              ++sz_per_row[n];
        }
        else
        {
          for (int n = 0; n < nvec; n++)
            if (std::abs(vec[n]) > cutoff)
              ++sz_per_row[cnt];
        }
      }
    }
    // reset "amount of work done" to full alpha piece
    cnt = NAEA * (kN_alpha - k0_alpha);
    if (type == COLLINEAR)
    {
      // reset "shift"
      for (int k = k0_beta; k < kN_beta; k++)
      {
        for (int a = 0; a < NAEB; a++, cnt++)
        {
          if (cnt < ak0)
            continue;
          if (cnt >= ak1)
            break;
          std::fill_n(vec.origin(), vec.size(), SPComplexType(0, 0));
          auto Aa = (*Beta)[a];
          for (int ip = 0; ip < Aa.num_non_zero_elements(); ++ip)
          {
            auto Aai = Aa.non_zero_values_data()[ip];
            auto i = Aa.non_zero_indices2_data()[ip];
            if (conjV)
              csr::axpy('C', Aai, CholMat[k * NMO + i], vec);
            else
              csr::axpy('N', Aai, CholMat[i * NMO + k], vec);
          }
          if (transpose)
          {
            for (int n = 0; n < nvec; n++)
              if (std::abs(vec[n]) > cutoff)
                ++sz_per_row[n];
          }
          else
          {
            for (int n = 0; n < nvec; n++)
              if (std::abs(vec[n]) > cutoff)
                ++sz_per_row[cnt];
          }
        }
      }
    }
    TG.Node().all_reduce_in_place_n(sz_per_row.begin(), sz_per_row.size(), std::plus<>());
    reserve_to_fit(Q, sz_per_row);
  } // else
  // Q.reserve( std::size_t(0.1 * nvec * NEL * NMO / ncores ) ); // by default, assume 10% sparsity

  int cnt = 0;
  for (int k = k0_alpha; k < kN_alpha; k++)
  {
    for (int a = 0; a < NAEA; a++, cnt++)
    {
      if (cnt < ak0)
        continue;
      if (cnt >= ak1)
        break;
      std::fill_n(vec.origin(), vec.size(), SPComplexType(0, 0));
      auto Aa = (*Alpha)[a];
      for (int ip = 0; ip < Aa.num_non_zero_elements(); ++ip)
      {
        auto Aai = Aa.non_zero_values_data()[ip];
        auto i = Aa.non_zero_indices2_data()[ip];
        if (conjV)
          csr::axpy('C', Aai, CholMat[k * NMO + i], vec);
        else
          csr::axpy('N', Aai, CholMat[i * NMO + k], vec);
      }
      if (transpose)
      {
        for (int n = 0; n < nvec; n++)
          if (std::abs(vec[n]) > cutoff)
            emplace(Q, std::forward_as_tuple(n, cnt, vec[n]));
      }
      else
      {
        for (int n = 0; n < nvec; n++)
          if (std::abs(vec[n]) > cutoff)
            emplace(Q, std::forward_as_tuple(cnt, n, vec[n]));
      }
    }
  }
  // reset "amount of work done" to full alpha piece
  cnt = NAEA * (kN_alpha - k0_alpha);
  if (type == COLLINEAR)
  {
    // reset "shift"
    for (int k = k0_beta; k < kN_beta; k++)
    {
      for (int a = 0; a < NAEB; a++, cnt++)
      {
        if (cnt < ak0)
          continue;
        if (cnt >= ak1)
          break;
        std::fill_n(vec.origin(), vec.size(), SPComplexType(0, 0));
        auto Aa = (*Beta)[a];
        for (int ip = 0; ip < Aa.num_non_zero_elements(); ++ip)
        {
          auto Aai = Aa.non_zero_values_data()[ip];
          auto i = Aa.non_zero_indices2_data()[ip];
          if (conjV)
            csr::axpy('C', Aai, CholMat[k * NMO + i], vec);
          else
            csr::axpy('N', Aai, CholMat[i * NMO + k], vec);
        }
        if (transpose)
        {
          for (int n = 0; n < nvec; n++)
            if (std::abs(vec[n]) > cutoff)
              emplace(Q, std::forward_as_tuple(n, cnt, vec[n]));
        }
        else
        {
          for (int n = 0; n < nvec; n++)
            if (std::abs(vec[n]) > cutoff)
              emplace(Q, std::forward_as_tuple(cnt, n, vec[n]));
        }
      }
    }
  }
}

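/*
 * Illustrative usage sketch, not part of this header: a minimal driver under
 * the assumption that Q is a shared-memory ucsr matrix as used by
 * halfRotateCholeskyMatrixForBias below (TG, PsiT_alpha, PsiT_beta and
 * CholMat are hypothetical caller-side objects):
 *
 *   using Alloc = shared_allocator<SPComplexType>;
 *   std::size_t Qdim = NAEA * NMO + NAEB * NMO; // COLLINEAR, full [k0,kN) range
 *   SpCType_shm_csr_matrix::base Q(tp_ul_ul{Qdim, nvec}, tp_ul_ul{0, 0}, 0, Alloc(TG.Node()));
 *   sparse_rotate::halfRotateCholeskyMatrix(COLLINEAR, TG, 0, 2 * NMO, Q,
 *                                           &PsiT_alpha, &PsiT_beta, CholMat,
 *                                           false); // transpose == false, rows ordered by (k,a)
 */
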
/*
 * Calculates the rotated Cholesky matrix used in the calculation of the bias potential:
 *   v(n,ak) = sum_i^M A(a,i) * Spvn(ik,n), where A(a,i) = conj(PsiT(i,a))
 *   v(n,N*M+ak) = sum_i^M beta(a,i) * Spvn(ik,n),
 * where M/N is the number of columns/rows of alpha and beta.
 * The number of rows of Spvn should be equal to M*M.
 * Since we only rotate the "local" Cholesky matrix, the algorithm is only parallelized
 * over a node. In principle, it is possible to spread this over equivalent nodes.
 */
template<typename T,
         class task_group,
         class PsiT_Type,
         typename = typename std::enable_if_t<std::is_same<T, std::complex<double>>::value or
                                              std::is_same<T, std::complex<float>>::value>>
SpCType_shm_csr_matrix halfRotateCholeskyMatrixForBias(WALKER_TYPES type,
                                                       task_group& TG,
                                                       PsiT_Type* Alpha,
                                                       PsiT_Type* Beta,
                                                       SpVType_shm_csr_matrix const& CholMat,
                                                       double cutoff = 1e-6)
{
  int NAEA = Alpha->size(0);
  int NAEB = Alpha->size(0); // CLOSED: beta sector has the same size as alpha
  int NMO = Alpha->size(1);
  if (type != CLOSED)
    NAEB = Beta->size(0);
  int nvec = CholMat.size(1);
  int ncores = TG.getTotalCores(), coreid = TG.getCoreID();

  // to speed up, generate new communicator for eqv_nodes and split full work among all
  // cores in this comm. Then build from distributed container?

  assert(CholMat.size(0) == NMO * NMO);

  std::size_t Qdim = NAEA * NMO;
  if (type == COLLINEAR)
    Qdim += NAEB * NMO;
  if (type == NONCOLLINEAR)
    Qdim = 2 * NMO * (NAEA + NAEB);
  std::size_t ak0, ak1;
  std::tie(ak0, ak1) = FairDivideBoundary(std::size_t(coreid), Qdim, std::size_t(ncores));

  if (type == NONCOLLINEAR)
    APP_ABORT(" GHF not yet implemented. \n");

  boost::multi::array<SPComplexType, 1> vec(iextensions<1u>{nvec});
  std::vector<std::size_t> sz_per_row(nvec);
  std::size_t cnt = 0;
  for (int a = 0; a < NAEA; a++)
  {
    for (int k = 0; k < NMO; k++, cnt++)
    {
      if (cnt < ak0)
        continue;
      if (cnt >= ak1)
        break;
      std::fill_n(vec.origin(), vec.size(), SPComplexType(0, 0));
      auto Aa = (*Alpha)[a];
      for (int ip = 0; ip < Aa.num_non_zero_elements(); ++ip)
      {
        auto Aai = Aa.non_zero_values_data()[ip];
        auto i = Aa.non_zero_indices2_data()[ip];
        csr::axpy('N', Aai, CholMat[i * NMO + k], vec);
      }
      for (std::size_t n = 0; n < nvec; n++)
        if (std::abs(vec[n]) > cutoff)
          ++(sz_per_row[n]);
    }
  }

  // reset "amount of work done" to full alpha piece
  cnt = NAEA * NMO;
  if (type == COLLINEAR)
  {
    // reset "shift"
    for (int a = 0; a < NAEB; a++)
    {
      for (int k = 0; k < NMO; k++, cnt++)
      {
        if (cnt < ak0)
          continue;
        if (cnt >= ak1)
          break;
        std::fill_n(vec.origin(), vec.size(), SPComplexType(0, 0));
        auto Aa = (*Beta)[a];
        for (int ip = 0; ip < Aa.num_non_zero_elements(); ++ip)
        {
          auto Aai = Aa.non_zero_values_data()[ip];
          auto i = Aa.non_zero_indices2_data()[ip];
          csr::axpy('N', Aai, CholMat[i * NMO + k], vec);
        }
        for (std::size_t n = 0; n < nvec; n++)
          if (std::abs(vec[n]) > cutoff)
            ++sz_per_row[n];
      }
    }
  }
  TG.Node().all_reduce_in_place_n(sz_per_row.begin(), sz_per_row.size(), std::plus<>());

  using Alloc = shared_allocator<SPComplexType>;
  SpCType_shm_csr_matrix::base ucsr(tp_ul_ul{nvec, Qdim}, tp_ul_ul{0, 0}, sz_per_row, Alloc(TG.Node()));

  using mat_wrapper = csr::matrix_emplace_wrapper<SpCType_shm_csr_matrix::base>;
  mat_wrapper ucsr_wrapper(ucsr, TG.Node());

  cnt = 0;
  for (int a = 0; a < NAEA; a++)
  {
    for (int k = 0; k < NMO; k++, cnt++)
    {
      if (cnt < ak0)
        continue;
      if (cnt >= ak1)
        break;
      std::fill_n(vec.origin(), vec.size(), SPComplexType(0, 0));
      auto Aa = (*Alpha)[a];
      for (int ip = 0; ip < Aa.num_non_zero_elements(); ++ip)
      {
        auto Aai = Aa.non_zero_values_data()[ip];
        auto i = Aa.non_zero_indices2_data()[ip];
        csr::axpy('N', Aai, CholMat[i * NMO + k], vec);
      }
      for (std::size_t n = 0; n < nvec; n++)
        if (std::abs(vec[n]) > cutoff)
          ucsr_wrapper.emplace(std::forward_as_tuple(n, cnt, vec[n]));
    }
  }
  // reset "amount of work done" to full alpha piece
  cnt = NAEA * NMO;
  if (type == COLLINEAR)
  {
    // reset "shift"
    for (int a = 0; a < NAEB; a++)
    {
      for (int k = 0; k < NMO; k++, cnt++)
      {
        if (cnt < ak0)
          continue;
        if (cnt >= ak1)
          break;
        std::fill_n(vec.origin(), vec.size(), SPComplexType(0, 0));
        auto Aa = (*Beta)[a];
        for (int ip = 0; ip < Aa.num_non_zero_elements(); ++ip)
        {
          auto Aai = Aa.non_zero_values_data()[ip];
          auto i = Aa.non_zero_indices2_data()[ip];
          csr::axpy('N', Aai, CholMat[i * NMO + k], vec);
        }
        for (std::size_t n = 0; n < nvec; n++)
          if (std::abs(vec[n]) > cutoff)
            ucsr_wrapper.emplace(std::forward_as_tuple(n, cnt, vec[n]));
      }
    }
  }
  ucsr_wrapper.push_buffer();
  return SpCType_shm_csr_matrix(std::move(ucsr));
}

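/*
 * Minimal usage sketch (caller-side names are hypothetical). The returned
 * matrix holds the half-rotated Cholesky vectors SpvnT(n,ak) used when
 * assembling the bias potential; in the collinear case it has shape
 * (nvec, NAEA*NMO + NAEB*NMO):
 *
 *   auto SpvnT = sparse_rotate::halfRotateCholeskyMatrixForBias<SPComplexType>(
 *       COLLINEAR, TG, &PsiT_alpha, &PsiT_beta, CholMat, 1e-6);
 */
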
// This is needed to allow generic implementation of SparseTensorIO while catching errors at compile time.
template<typename T,
         class task_group,
         class PsiT_Type,
         typename = typename std::enable_if_t<not(std::is_same<T, std::complex<double>>::value or
                                                   std::is_same<T, std::complex<float>>::value)>,
         typename = void>
SpVType_shm_csr_matrix halfRotateCholeskyMatrixForBias(WALKER_TYPES type,
                                                       task_group& TG,
                                                       PsiT_Type* Alpha,
                                                       PsiT_Type* Beta,
                                                       SpVType_shm_csr_matrix const& CholMat,
                                                       double cutoff = 1e-6)
{
  print_stacktrace throw std::runtime_error(
      "Error: Incorrect template parameter in halfRotateCholeskyMatrixForBias. \n");
  using Alloc = shared_allocator<SPRealType>;
  SpVType_shm_csr_matrix csr(tp_ul_ul{1, 1}, tp_ul_ul{0, 0}, 1, Alloc(TG.Node()));
  return csr;
}

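/*
 * Dispatch sketch: the two overloads of halfRotateCholeskyMatrixForBias
 * select on T at compile time through enable_if, so generic code (e.g.
 * SparseTensorIO) can always spell the call; only a complex T reaches the
 * real implementation, while any other T compiles but hits the error path
 * above. For example (hypothetical caller-side names):
 *
 *   auto M1 = halfRotateCholeskyMatrixForBias<std::complex<double>>(type, TG, &A, &B, CholMat); // real work
 *   auto M2 = halfRotateCholeskyMatrixForBias<double>(type, TG, &A, &B, CholMat); // compiles, throws at run time
 */
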
} // namespace sparse_rotate

namespace ma_rotate
{
template<class MultiArray2D, class task_group, class PsiT_Type>
void halfRotateCholeskyMatrix(WALKER_TYPES type,
                              task_group& TG,
                              int k0,
                              int kN,
                              MultiArray2D&& Q,
                              PsiT_Type* Alpha,
                              PsiT_Type* Beta,
                              SpVType_shm_csr_matrix const& CholMat,
                              bool transpose,
                              bool conjV = false,
                              double cutoff = 1e-6)
{
  int NAEA = Alpha->size(0);
  int NAEB = 0;
  int NMO = Alpha->size(1);
  if (type == COLLINEAR)
    NAEB = Beta->size(0);
  int nvec = CholMat.size(1);
  int ncores = TG.getTotalCores(), coreid = TG.getCoreID();

  assert(CholMat.size(0) == NMO * NMO);
  if (type == CLOSED && kN > NMO)
    APP_ABORT(" Error: kN > NMO in halfRotateCholeskyMatrix. \n");

  // map from [0:2*NMO) to [0:NMO) in collinear case
  int k0_alpha = 0, k0_beta = 0, kN_alpha = 0, kN_beta = 0;
  if (k0 >= 0)
  {
    k0_alpha = std::min(k0, NMO);
    kN_alpha = std::min(kN, NMO);
    if (type == COLLINEAR)
    {
      k0_beta = std::max(k0, NMO) - NMO;
      kN_beta = std::max(kN, NMO) - NMO;
    }
  }

  int ak0, ak1;
  int Qdim = NAEA * (kN_alpha - k0_alpha) + NAEB * (kN_beta - k0_beta);
  if (transpose)
  {
    assert(Q.size(0) == nvec);
    assert(Q.size(1) == Qdim);
  }
  else
  {
    assert(Q.size(0) == Qdim);
    assert(Q.size(1) == nvec);
  }
  std::tie(ak0, ak1) = FairDivideBoundary(coreid, Qdim, ncores);

  if (type == NONCOLLINEAR)
    APP_ABORT(" GHF not yet implemented. \n");

  int cnt = 0;
  for (int k = k0_alpha; k < kN_alpha; k++)
  {
    for (int a = 0; a < NAEA; a++, cnt++)
    {
      if (cnt < ak0)
        continue;
      if (cnt >= ak1)
        break;
      if (transpose)
      {
        auto vec = Q(Q.extension(0), cnt);
        for (auto& v : vec)
          v = SPComplexType(0, 0);
        auto Aa = (*Alpha)[a];
        for (int ip = 0; ip < Aa.num_non_zero_elements(); ++ip)
        {
          auto Aai = Aa.non_zero_values_data()[ip];
          auto i = Aa.non_zero_indices2_data()[ip];
          if (conjV)
            csr::axpy('C', Aai, CholMat[k * NMO + i], vec);
          else
            csr::axpy('N', Aai, CholMat[i * NMO + k], vec);
        }
      }
      else
      {
        auto vec = Q[cnt];
        for (auto& v : vec)
          v = SPComplexType(0, 0);
        auto Aa = (*Alpha)[a];
        for (int ip = 0; ip < Aa.num_non_zero_elements(); ++ip)
        {
          auto Aai = Aa.non_zero_values_data()[ip];
          auto i = Aa.non_zero_indices2_data()[ip];
          if (conjV)
            csr::axpy('C', Aai, CholMat[k * NMO + i], vec);
          else
            csr::axpy('N', Aai, CholMat[i * NMO + k], vec);
        }
      }
    }
  }
  // reset "amount of work done" to full alpha piece
  cnt = NAEA * (kN_alpha - k0_alpha);
  if (type == COLLINEAR)
  {
    // reset "shift"
    for (int k = k0_beta; k < kN_beta; k++)
    {
      for (int a = 0; a < NAEB; a++, cnt++)
      {
        if (cnt < ak0)
          continue;
        if (cnt >= ak1)
          break;
        if (transpose)
        {
          auto vec = Q(Q.extension(0), cnt);
          for (auto& v : vec)
            v = SPComplexType(0, 0);
          auto Aa = (*Beta)[a];
          for (int ip = 0; ip < Aa.num_non_zero_elements(); ++ip)
          {
            auto Aai = Aa.non_zero_values_data()[ip];
            auto i = Aa.non_zero_indices2_data()[ip];
            if (conjV)
              csr::axpy('C', Aai, CholMat[k * NMO + i], vec);
            else
              csr::axpy('N', Aai, CholMat[i * NMO + k], vec);
          }
        }
        else
        {
          auto vec = Q[cnt];
          for (auto& v : vec)
            v = SPComplexType(0, 0);
          auto Aa = (*Beta)[a];
          for (int ip = 0; ip < Aa.num_non_zero_elements(); ++ip)
          {
            auto Aai = Aa.non_zero_values_data()[ip];
            auto i = Aa.non_zero_indices2_data()[ip];
            if (conjV)
              csr::axpy('C', Aai, CholMat[k * NMO + i], vec);
            else
              csr::axpy('N', Aai, CholMat[i * NMO + k], vec);
          }
        }
      }
    }
  }
  TG.Node().barrier();
}

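/*
 * Hedged usage sketch for the dense variant above (caller-side names are
 * hypothetical): Q is any 2D multi-array with shape (Qdim, nvec), or
 * (nvec, Qdim) when transpose == true. In the closed-shell case Beta is
 * never dereferenced, so the alpha component can be passed twice:
 *
 *   boost::multi::array<SPComplexType, 2> Q({NAEA * NMO, nvec});
 *   ma_rotate::halfRotateCholeskyMatrix(CLOSED, TG, 0, NMO, Q,
 *                                       &PsiT_alpha, &PsiT_alpha, CholMat,
 *                                       false); // transpose == false
 */
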
// designed for compact arrays
/*
 * Constructs the following contraction of the Slater matrix of the trial wavefunction and the Cholesky matrix (Likn).
 * Case:
 *   - Closed/Collinear: L[a][n][k] = sum_i A[a][i] L[i][k][n]
 *     - In the collinear case, two separate calls are made, one per spin channel.
 *   - Non-collinear: L[a][n][sk] = sum_i A[a][si] L[i][k][n]   // [si] == [s][i] combined spinor index
 *     - In this case, to preserve matrix dimensions, [s][k] --> [sk] is kept as a single index.
 */
template<class MultiArray2DA, class MultiArray3DB, class MultiArray3DC, class MultiArray2D>
void getLank(MultiArray2DA&& Aai,
             MultiArray3DB&& Likn,
             MultiArray3DC&& Lank,
             MultiArray2D&& buff,
             bool noncollinear = false)
{
  int npol = noncollinear ? 2 : 1;
  int na = Aai.size(0);
  if (na == 0)
    return;
  int ni = Aai.size(1) / npol;
  int nk = Likn.size(1);
  int nchol = Likn.size(2);
  assert(Likn.size(0) == ni);
  assert(Lank.size(0) == na);
  assert(Lank.size(1) == nchol);
  assert(Lank.size(2) == nk * npol);
  assert(buff.size(0) >= npol * nk);
  assert(buff.size(1) >= nchol);
  if (noncollinear)
    assert(Aai.stride(0) == Aai.size(1)); // make sure it is contiguous

  using elementA = typename std::decay<MultiArray2DA>::type::element;
  using element = typename std::decay<MultiArray3DC>::type::element;
  boost::multi::array_ref<elementA, 2> Aas_i(to_address(Aai.origin()), {na * npol, ni});
  boost::multi::array_ref<element, 2> Li_kn(to_address(Likn.origin()), {ni, nk * nchol});
  boost::multi::array_ref<element, 2> Las_kn(to_address(Lank.origin()), {na * npol, nk * nchol});

  ma::product(Aas_i, Li_kn, Las_kn);
  for (int a = 0; a < na; a++)
  {
    boost::multi::array_ref<element, 2> Lskn(to_address(Lank[a].origin()), {npol * nk, nchol});
    boost::multi::array_ref<element, 2> Lnsk(to_address(Lank[a].origin()), {nchol, npol * nk});
    buff({0, npol * nk}, {0, nchol}) = Lskn;
    ma::transpose(buff({0, npol * nk}, {0, nchol}), Lnsk);
  }
}

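/*
 * Index-bookkeeping sketch for getLank (values hypothetical): with na = 2,
 * ni = nk = 4, nchol = 10 and npol = 1, Aas_i is (2,4), Li_kn views Likn as
 * (4,40), and the single ma::product yields Las_kn = (2,40), i.e. L[a][k][n];
 * the per-a pass through buff then transposes each (4,10) slice into the
 * (10,4) layout L[a][n][k] in place.
 */
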
/*
 * Constructs the following contraction of the Slater matrix of the trial wavefunction and the Cholesky matrix (Lkin).
 * Case:
 *   - Closed/Collinear: L[a][n][k] = sum_i A[a][i] conj(L[k][i][n])
 *     - In the collinear case, two separate calls are made, one per spin channel.
 *   - Non-collinear: L[a][n][sk] = sum_i A[a][si] conj(L[k][i][n])   // [si] == [s][i] combined spinor index
 *     - In this case, to preserve matrix dimensions, [s][k] --> [sk] is kept as a single index.
 */
template<class MultiArray2DA, class MultiArray3DB, class MultiArray3DC, class MultiArray2D>
void getLank_from_Lkin(MultiArray2DA&& Aai,
                       MultiArray3DB&& Lkin,
                       MultiArray3DC&& Lank,
                       MultiArray2D&& buff,
                       bool noncollinear = false)
{
  int npol = noncollinear ? 2 : 1;
  int na = Aai.size(0);
  if (na == 0)
    return;
  int ni = Aai.size(1) / npol;
  int nk = Lkin.size(0);
  int nchol = Lkin.size(2);
  assert(Lkin.size(1) == ni);
  assert(Lank.size(0) == na);
  assert(Lank.size(1) == nchol);
  assert(Lank.size(2) == nk * npol);
  assert(buff.num_elements() >= na * npol * nchol);
  if (noncollinear)
    assert(Aai.stride(0) == Aai.size(1)); // make sure it is contiguous

  using Type = typename std::decay<MultiArray3DC>::type::element;
  using elementA = typename std::decay<MultiArray2DA>::type::element;
  boost::multi::array_ref<elementA, 2> Aas_i(to_address(Aai.origin()), {na * npol, ni});
  boost::multi::array_ref<Type, 2> bnas(to_address(buff.origin()), {nchol, na * npol});
  // Lank[a][n][k]  = sum_i Aai[a][i] conj(Lkin[k][i][n])
  // Lank[as][n][k] = sum_i Aai[as][i] conj(Lkin[k][i][n])
  for (int k = 0; k < nk; k++)
  {
    ma::product(ma::H(Lkin[k]), ma::T(Aas_i), bnas);
    for (int a = 0; a < na; a++)
      for (int n = 0; n < nchol; n++)
        for (int p = 0; p < npol; p++)
          Lank[a][n][p * nk + k] = bnas[n][a * npol + p];
  }
}

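/*
 * Contrast with getLank above: here the Cholesky tensor is stored as
 * L[k][i][n], so instead of one large product the routine loops over k,
 * forming bnas = conj(Lkin[k])^T * Aas_i^T, an (nchol, na*npol) block that is
 * then scattered into Lank[a][n][p*nk + k]. A hypothetical collinear
 * (npol == 1) call:
 *
 *   boost::multi::array<SPComplexType, 2> buff({nchol, na});
 *   ma_rotate::getLank_from_Lkin(Aai, Lkin, Lank, buff);
 */
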
} // namespace ma_rotate

namespace ma_rotate_padded
{
// designed for padded arrays
template<class MultiArray2DA, class MultiArray3DB, class MultiArray3DC>
void getLakn_Lank(MultiArray2DA&& Aai,
                  MultiArray3DB&& Likn,
                  MultiArray3DC&& Lakn,
                  MultiArray3DC&& Lank,
                  bool noncollinear = false)
{
  int npol = noncollinear ? 2 : 1;
  int na = Aai.size(0);
  if (na == 0)
    return;
  int ni = Aai.size(1) / npol;

  int nmo = Likn.size(0);
  int nchol = Likn.size(2);
  assert(Likn.size(1) == nmo);

  assert(Lakn.size(1) == npol * nmo);
  assert(Lakn.size(2) == nchol);

  assert(Lakn.size(0) >= na);
  assert(Lakn.size(0) == Lank.size(0));
  assert(Lank.size(1) == nchol);
  assert(Lank.size(2) == npol * nmo);

  if (noncollinear)
    assert(Aai.stride(0) == Aai.size(1)); // make sure it is contiguous

  using elmA = typename std::decay<MultiArray2DA>::type::element;
  using elmB = typename std::decay<MultiArray3DB>::type::element;
  using elmC = typename std::decay<MultiArray3DC>::type::element;

  boost::multi::array_ref<elmA, 2> Aas_i(to_address(Aai.origin()), {na * npol, ni});
  boost::multi::array_ref<elmB, 2, decltype(Likn.origin())> Li_kn(Likn.origin(), {ni, nmo * nchol});
  boost::multi::array_ref<elmC, 2, decltype(Lakn.origin())> Las_kn(Lakn.origin(), {na * npol, nmo * nchol});

  ma::product(Aas_i, Li_kn, Las_kn);
  for (int a = 0; a < na; a++)
    ma::transpose(Lakn[a], Lank[a]);
}

template<class MultiArray2DA, class MultiArray3DB, class MultiArray3DC, class MultiArray2D>
void getLakn_Lank_from_Lkin(MultiArray2DA&& Aai,
                            MultiArray3DB&& Lkin,
                            MultiArray3DC&& Lakn,
                            MultiArray3DC&& Lank,
                            MultiArray2D&& buff,
                            bool noncollinear = false)
{
  int npol = noncollinear ? 2 : 1;
  int na = Aai.size(0);
  if (na == 0)
    return;
  int ni = Aai.size(1) / npol;

  int nmo = Lkin.size(0);
  int nchol = Lkin.size(2);
  assert(Lkin.size(1) == nmo);

  assert(Lakn.size(1) == npol * nmo);
  assert(Lakn.size(2) == nchol);

  assert(Lakn.size(0) >= na);
  assert(Lakn.size(0) == Lank.size(0));
  assert(Lank.size(1) == nchol);
  assert(Lank.size(2) == npol * nmo);

  if (noncollinear)
    assert(Aai.stride(0) == Aai.size(1)); // make sure it is contiguous

  assert(buff.num_elements() >= na * npol * nchol);

  using ptr2 = typename std::decay<MultiArray2D>::type::element_ptr;
  using elm2 = typename std::decay<MultiArray2D>::type::element;
  using elmA = typename std::decay<MultiArray2DA>::type::element;

  boost::multi::array_ref<elmA, 2> Aas_i(to_address(Aai.origin()), {na * npol, ni});
  boost::multi::array_ref<elm2, 2, ptr2> bnas(buff.origin(), {nchol, na * npol});
  // Lakn[a][sk][n] = sum_i Aai[as][i] conj(Lkin[k][i][n])
  for (int k = 0; k < nmo; k++)
  {
    ma::product(ma::H(Lkin[k].sliced(0, ni)), ma::T(Aas_i), bnas);
    for (int a = 0; a < na; a++)
      for (int p = 0; p < npol; p++)
        Lakn[a][p * nmo + k] = bnas({0, nchol}, a * npol + p);
  }
  for (int a = 0; a < na; a++)
    ma::transpose(Lakn[a], Lank[a]);
}


} // namespace ma_rotate_padded


} // namespace afqmc

} // namespace qmcplusplus

#endif