1 //
2 // BAGEL - Brilliantly Advanced General Electronic Structure Library
3 // Filename: fci_rdm.cc
4 // Copyright (C) 2011 Toru Shiozaki
5 //
6 // Author: Toru Shiozaki <shiozaki@northwestern.edu>
7 // Maintainer: Shiozaki group
8 //
9 // This file is part of the BAGEL package.
10 //
11 // This program is free software: you can redistribute it and/or modify
12 // it under the terms of the GNU General Public License as published by
13 // the Free Software Foundation, either version 3 of the License, or
14 // (at your option) any later version.
15 //
16 // This program is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 // GNU General Public License for more details.
20 //
21 // You should have received a copy of the GNU General Public License
22 // along with this program.  If not, see <http://www.gnu.org/licenses/>.
23 //
24 
25 
26 #include <src/ci/fci/fci.h>
27 #include <src/util/prim_op.h>
28 #include <src/util/math/algo.h>
29 #include <src/wfn/rdm.h>
30 
31 using namespace std;
32 using namespace bagel;
33 
FCI_bare(shared_ptr<const CIWfn> ci)34 FCI_bare::FCI_bare(shared_ptr<const CIWfn> ci) {
35   print_thresh_ = 1.0e-8;
36   nelea_ = ci->det()->nelea();
37   neleb_ = ci->det()->neleb();
38   ncore_ = ci->ncore();
39   norb_  = ci->nact();
40   nstate_ = ci->nstates();
41   energy_ = ci->energies();
42   cc_ = ci->civectors() ? ci->civectors()->copy() : nullptr;
43   det_ = ci->det();
44   rdm1_ = make_shared<VecRDM<1>>();
45   rdm2_ = make_shared<VecRDM<2>>();
46 }
47 
48 
compute_rdm12()49 void FCI::compute_rdm12() {
50   // Needs initialization here because we use daxpy.
51   // For nstate_ == 1, rdm1_av_ = rdm1_->at(0).
52   if (rdm1_av_ == nullptr && nstate_ > 1) {
53     rdm1_av_ = make_shared<RDM<1>>(norb_);
54     rdm2_av_ = make_shared<RDM<2>>(norb_);
55   } else if (nstate_ > 1) {
56     rdm1_av_->zero();
57     rdm2_av_->zero();
58   }
59   // we need expanded lists
60   auto detex = make_shared<Determinants>(norb_, nelea_, neleb_, /*compressed=*/false, /*mute=*/true);
61   cc_->set_det(detex);
62 
63   for (int i = 0; i != nstate_; ++i)
64     compute_rdm12(i, i);
65 
66   // calculate state averaged RDMs
67   if (nstate_ != 1) {
68     for (int ist = 0; ist != nstate_; ++ist) {
69       rdm1_av_->ax_plus_y(weight_[ist], rdm1_->at(ist));
70       rdm2_av_->ax_plus_y(weight_[ist], rdm2_->at(ist));
71     }
72   } else {
73     rdm1_av_ = rdm1_->at(0,0);
74     rdm2_av_ = rdm2_->at(0,0);
75   }
76 
77   cc_->set_det(det_);
78 }
79 
80 
compute_rdm12(const int ist,const int jst)81 void FCI::compute_rdm12(const int ist, const int jst) {
82   if (det_->compress()) {
83     auto detex = make_shared<Determinants>(norb_, nelea_, neleb_, false, /*mute=*/true);
84     cc_->set_det(detex);
85   }
86 
87   shared_ptr<Civec> ccbra = cc_->data(ist);
88   shared_ptr<Civec> ccket = cc_->data(jst);
89 
90   shared_ptr<RDM<1>> rdm1;
91   shared_ptr<RDM<2>> rdm2;
92   tie(rdm1, rdm2) = compute_rdm12_from_civec(ccbra, ccket);
93 
94   // setting to private members.
95   rdm1_->emplace(ist, jst, rdm1);
96   rdm2_->emplace(ist, jst, rdm2);
97 
98   cc_->set_det(det_);
99 }
100 
101 
102 tuple<shared_ptr<RDM<1>>, shared_ptr<RDM<2>>>
compute_rdm12_from_civec(shared_ptr<const Civec> cbra,shared_ptr<const Civec> cket) const103   FCI::compute_rdm12_from_civec(shared_ptr<const Civec> cbra, shared_ptr<const Civec> cket) const {
104 
105   // since we consider here number conserving operators...
106   auto dbra = make_shared<Dvec>(cbra->det(), norb_*norb_);
107   sigma_2a1(cbra, dbra);
108   sigma_2a2(cbra, dbra);
109 
110   shared_ptr<Dvec> dket;
111   // if bra and ket vectors are different, we need to form Sigma for ket as well.
112   if (cbra != cket) {
113     dket = make_shared<Dvec>(cket->det(), norb_*norb_);
114     sigma_2a1(cket, dket);
115     sigma_2a2(cket, dket);
116   } else {
117     dket = dbra;
118   }
119 
120   return compute_rdm12_last_step(dbra, dket, cbra);
121 }
122 
123 
124 tuple<shared_ptr<RDM<1>>, shared_ptr<RDM<2>>>
compute_rdm12_last_step(shared_ptr<const Dvec> dbra,shared_ptr<const Dvec> dket,shared_ptr<const Civec> cibra) const125 FCI::compute_rdm12_last_step(shared_ptr<const Dvec> dbra, shared_ptr<const Dvec> dket, shared_ptr<const Civec> cibra) const {
126 
127   const int nri = cibra->asize()*cibra->lenb();
128 
129   // 1RDM c^dagger <I|\hat{E}|0>
130   // 2RDM \sum_I <0|\hat{E}|I> <I|\hat{E}|0>
131   auto rdm1 = make_shared<RDM<1>>(norb_);
132   auto rdm2 = make_shared<RDM<2>>(norb_);
133   {
134     auto cibra_data = make_shared<VectorB>(nri);
135     copy_n(cibra->data(), nri, cibra_data->data());
136 
137     auto dketv = btas::group(*dket,0,2);
138     auto rdm1t = btas::group(*rdm1,0,2);
139     btas::contract(1.0, dketv, {0,1}, *cibra_data, {0}, 0.0, rdm1t, {1});
140 
141     auto dbrav = btas::group(*dbra,0,2);
142     auto rdm2t = group(group(*rdm2, 2,4), 0,2);
143     btas::contract(1.0, dbrav, {1,0}, dketv, {1,2}, 0.0, rdm2t, {0,2});
144   }
145 
146   // sorting... a bit stupid but cheap anyway
147   // This is since we transpose operator pairs in dgemm - cheaper to do so after dgemm (usually Nconfig >> norb_**2).
148   unique_ptr<double[]> buf(new double[norb_*norb_]);
149   for (int i = 0; i != norb_; ++i) {
150     for (int k = 0; k != norb_; ++k) {
151       copy_n(&rdm2->element(0,0,k,i), norb_*norb_, buf.get());
152       blas::transpose(buf.get(), norb_, norb_, rdm2->element_ptr(0,0,k,i));
153     }
154   }
155 
156   // put in diagonal into 2RDM
157   // Gamma{i+ k+ l j} = Gamma{i+ j k+ l} - delta_jk Gamma{i+ l}
158   for (int i = 0; i != norb_; ++i)
159     for (int k = 0; k != norb_; ++k)
160       for (int j = 0; j != norb_; ++j)
161         rdm2->element(j,k,k,i) -= rdm1->element(j,i);
162 
163   return tie(rdm1, rdm2);
164 }
165 
166 
167 tuple<shared_ptr<RDM<1>>, shared_ptr<RDM<2>>>
compute_rdm12_av_from_dvec(shared_ptr<const Dvec> dbra,shared_ptr<const Dvec> dket,shared_ptr<const Determinants> o) const168 FCI::compute_rdm12_av_from_dvec(shared_ptr<const Dvec> dbra, shared_ptr<const Dvec> dket, shared_ptr<const Determinants> o) const {
169 
170   if (o != nullptr) {
171     dbra->set_det(o);
172     dket->set_det(o);
173   }
174 
175   auto rdm1 = make_shared<RDM<1>>(norb_);
176   auto rdm2 = make_shared<RDM<2>>(norb_);
177 
178   assert(dbra->ij() == dket->ij() && dbra->det() == dket->det());
179 
180   for (int i = 0; i != dbra->ij(); ++i) {
181     shared_ptr<RDM<1>> r1;
182     shared_ptr<RDM<2>> r2;
183     tie(r1, r2) = compute_rdm12_from_civec(dbra->data(i), dket->data(i));
184     rdm1->ax_plus_y(weight_[i], r1);
185     rdm2->ax_plus_y(weight_[i], r2);
186   }
187 
188   if (o != nullptr) {
189     dbra->set_det(det_);
190     dket->set_det(det_);
191   }
192 
193   return tie(rdm1, rdm2);
194 }
195 
196 // computes 3 and 4RDM
rdm34(const int ist,const int jst) const197 tuple<shared_ptr<RDM<3>>, shared_ptr<RDM<4>>> FCI::rdm34(const int ist, const int jst) const {
198   auto rdm3 = make_shared<RDM<3>>(norb_);
199   auto rdm4 = make_shared<RDM<4>>(norb_);
200 
201   auto detex = make_shared<Determinants>(norb_, nelea_, neleb_, false, /*mute=*/true);
202   cc_->set_det(detex);
203 
204   shared_ptr<Civec> cbra = cc_->data(ist);
205   shared_ptr<Civec> cket = cc_->data(jst);
206 
207   // first make <I|E_ij|0>
208   auto dbra = make_shared<Dvec>(cbra->det(), norb_*norb_);
209   sigma_2a1(cbra, dbra);
210   sigma_2a2(cbra, dbra);
211 
212   shared_ptr<Dvec> dket = dbra;
213   if (cbra != cket) {
214     dket = dbra->clone();
215     sigma_2a1(cket, dket);
216     sigma_2a2(cket, dket);
217   }
218 
219   // second make <J|E_kl|I><I|E_ij|0> - delta_il <J|E_kj|0>
220   auto make_evec_half = [this](shared_ptr<Dvec> d, shared_ptr<Matrix> e, const int dsize, const int offset) {
221     const int norb2 = norb_ * norb_;
222     const int lena = cc_->det()->lena();
223     const int lenb = cc_->det()->lenb();
224     int no = 0;
225 
226     for (int ij = 0; ij != norb2; ++ij) {
227       const int j = ij/norb_;
228       const int i = ij-j*norb_;
229 
230       for (int kl = ij; kl != norb2; ++kl) {
231         const int l = kl/norb_;
232         const int k = kl-l*norb_;
233 
234         for (auto& iter : cc_->det()->phia(k,l)) {
235           size_t iaJ = iter.source;
236           size_t iaI = iter.target;
237           double sign = static_cast<double>(iter.sign);
238           for (size_t ib = 0; ib != lenb; ++ib) {
239             size_t iI = ib + iaI*lenb;
240             size_t iJ = ib + iaJ*lenb;
241             if ((iJ - offset) < dsize && iJ >= offset)
242               e->element(iJ-offset, no) += sign * d->data(ij)->data(iI);
243           }
244         }
245 
246         for (size_t ia = 0; ia != lena; ++ia) {
247           for (auto& iter : cc_->det()->phib(k,l)) {
248             size_t ibJ = iter.source;
249             size_t ibI = iter.target;
250             double sign = static_cast<double>(iter.sign);
251             size_t iI = ibI + ia*lenb;
252             size_t iJ = ibJ + ia*lenb;
253             if ((iJ - offset) < dsize && iJ >= offset)
254               e->element(iJ-offset, no) += sign * d->data(ij)->data(iI);
255           }
256         }
257 
258         if (i == l) {
259           const int kj = k+j*norb_;
260           for (size_t iJ = offset; iJ != offset+dsize; ++iJ) {
261             e->element(iJ-offset, no) -= d->data(kj)->data(iJ);
262           }
263         }
264         ++no;
265       }
266     }
267   };
268 
269   // RDM3, RDM4 construction is multipassed and parallelized:
270   //  (1) When ndet > 10000, (ndet < 10000 -> too small, almost no gain)
271   //  and (2) When we have processes more than one
272   //  OR  (3) When the number of words in <I|E_ij,kl|0> is larger than (10,10) case (635,040,000)
273   const size_t ndet = cbra->det()->size();
274   const size_t norb2 = norb_ * norb_;
275   const size_t ijmax = 635040001 * 2;
276   const size_t ijnum = ndet * norb2 * norb2;
277   const size_t npass = ((mpi__->size() * 2 > ((ijnum-1)/ijmax + 1)) && (mpi__->size() != 1) && ndet > 10000) ? mpi__->size() * 2 : (ijnum-1) / ijmax + 1;
278   const size_t nsize = (ndet-1) / npass + 1;
279   Timer timer;
280   if (npass > 1) {
281     cout << "    * Third and fourth order RDM (" << setw(2) << ist + 1 << "," << setw(2) << jst + 1 << ") evaluation" << endl;
282     cout << "      will be done with " << npass << " passes" << endl;
283   }
284 
285   rdm3->zero();
286   rdm4->zero();
287 
288   // multipassing through {I}
289   for (size_t ipass = 0; ipass != npass; ++ipass) {
290     if (ipass % mpi__->size() != mpi__->rank()) continue;
291 
292     const size_t ioffset = ipass * nsize;
293     const size_t isize = (ipass != (npass - 1)) ? nsize : ndet - ioffset;
294     const size_t halfsize = norb2 * (norb2 + 1) / 2;
295     auto eket_half = make_shared<Matrix>(isize, halfsize, /*local=*/true);
296     make_evec_half(dket, eket_half, isize, ioffset);
297 
298     auto dbram = make_shared<Matrix>(isize, norb2, /*local=*/true);
299     for (size_t ij = 0; ij != norb2; ++ij)
300       copy_n(&(dbra->data(ij)->data(ioffset)), isize, dbram->element_ptr(0, ij));
301 
302     // put in third-order RDM: <0|E_mn|I><I|E_ij < kl|0>
303     auto tmp3 = make_shared<Matrix>(*dbram % *eket_half);
304     auto tmp3_full = make_shared<Matrix>(norb2, norb2 * norb2, /*local=*/true);
305 
306     for (size_t mn = 0; mn != norb2; ++mn) {
307       int no = 0;
308       for (size_t ij = 0; ij != norb2; ++ij) {
309         for (size_t kl = ij; kl != norb2; ++kl) {
310           int ijkl = ij + kl*norb2;
311           int klij = kl + ij*norb2;
312           tmp3_full->element(mn, ijkl) = tmp3->element(mn, no);
313           tmp3_full->element(mn, klij) = tmp3->element(mn, no);
314           ++no;
315         }
316       }
317     }
318     sort_indices<1,0,2,1,1,1,1>(tmp3_full->data(), rdm3->data(), norb_, norb_, norb2*norb2);
319 
320     // put in fourth-order RDM: <0|E_ij > kl|I><I|E_mn < op|0>
321     shared_ptr<Matrix> ebra_half = eket_half;
322     if (cbra != cket) {
323       ebra_half = eket_half->clone();
324       make_evec_half(dbra, ebra_half, isize, ioffset);
325     }
326     auto tmp4 = make_shared<Matrix>(*ebra_half % *eket_half);
327     auto tmp4_full = make_shared<Matrix>(norb2 * norb2, norb2 * norb2, /*local=*/true);
328 
329     {
330       int nklij = 0;
331       for (size_t kl = 0; kl != norb2; ++kl) {
332         for (size_t ij = kl; ij != norb2; ++ij) {
333           int nmnop = 0;
334           int klij = kl+ij*norb2;
335           int ijkl = ij+kl*norb2;
336           for (size_t mn = 0; mn != norb2; ++mn) {
337             for (size_t op = mn; op != norb2; ++op) {
338               int mnop = mn + op*norb2;
339               int opmn = op + mn*norb2;
340               tmp4_full->element(ijkl, mnop) = tmp4->element(nklij, nmnop);
341               tmp4_full->element(klij, mnop) = tmp4->element(nklij, nmnop);
342               tmp4_full->element(ijkl, opmn) = tmp4->element(nklij, nmnop);
343               tmp4_full->element(klij, opmn) = tmp4->element(nklij, nmnop);
344               ++nmnop;
345             }
346           }
347           ++nklij;
348         }
349       }
350     }
351     sort_indices<1,0,3,2,4,1,1,1,1>(tmp4_full->data(), rdm4->data(), norb_, norb_, norb_, norb_, norb2*norb2);
352   }
353 
354   rdm3->allreduce();
355   rdm4->allreduce();
356 
357   if (npass > 1) {
358     timer.tick_print("RDM evaluation (multipassing)");
359   }
360 
361   // The remaining terms can be evaluated without multipassing
362   {
363     // then perform Eq. 49 of JCP 89 5803 (Werner's MRCI paper)
364     // we assume that rdm2_[ist] is set
365     for (int i0 = 0; i0 != norb_; ++i0)
366       for (int i1 = 0; i1 != norb_; ++i1)
367         for (int i2 = 0; i2 != norb_; ++i2)
368           for (int i3 = 0; i3 != norb_; ++i3) {
369             blas::ax_plus_y_n(-1.0, rdm2_->at(ist, jst)->element_ptr(0, i2, i1, i0), norb_, rdm3->element_ptr(0, i3, i3, i2, i1, i0));
370             blas::ax_plus_y_n(-1.0, rdm2_->at(ist, jst)->element_ptr(0, i0, i3, i2), norb_, rdm3->element_ptr(0, i1, i3, i2, i1, i0));
371           }
372   }
373 
374   {
375     for (int l = 0; l != norb_; ++l)
376       for (int k = 0; k != norb_; ++k)
377         for (int j = 0; j != norb_; ++j)
378           for (int b = 0; b != norb_; ++b) {
379             blas::ax_plus_y_n(-1.0, rdm3->element_ptr(0,0,0,k,b,l), norb_*norb_*norb_, rdm4->element_ptr(0,0,0,j,j,k,b,l));
380             blas::ax_plus_y_n(-1.0, rdm3->element_ptr(0,0,0,l,b,k), norb_*norb_*norb_, rdm4->element_ptr(0,0,0,j,b,k,j,l));
381             for (int i = 0; i != norb_; ++i) {
382               blas::ax_plus_y_n(-1.0, rdm2_->at(ist, jst)->element_ptr(0,k,b,l), norb_, rdm4->element_ptr(0,i,b,j,i,k,j,l));
383               blas::ax_plus_y_n(-1.0, rdm2_->at(ist, jst)->element_ptr(0,l,b,k), norb_, rdm4->element_ptr(0,i,b,j,j,k,i,l));
384               for (int d = 0; d != norb_; ++d) {
385                 blas::ax_plus_y_n(-1.0, rdm3->element_ptr(0,k,b,j,d,l), norb_, rdm4->element_ptr(0,i,b,j,i,k,d,l));
386                 blas::ax_plus_y_n(-1.0, rdm3->element_ptr(0,l,b,j,d,k), norb_, rdm4->element_ptr(0,i,b,j,d,k,i,l));
387               }
388             }
389           }
390   }
391 
392   cc_->set_det(det_);
393 
394   return make_tuple(rdm3, rdm4);
395 }
396 
397 // computes 3 and Fock-weighted 4RDM
rdm34f(const int ist,const int jst,shared_ptr<const Matrix> fock) const398 tuple<shared_ptr<RDM<3>>, shared_ptr<RDM<3>>> FCI::rdm34f(const int ist, const int jst, shared_ptr<const Matrix> fock) const {
399   auto rdm3 = make_shared<RDM<3>>(norb_);
400   auto rdm4f = make_shared<RDM<3>>(norb_);
401 
402   auto detex = make_shared<Determinants>(norb_, nelea_, neleb_, false, /*mute=*/true);
403   cc_->set_det(detex);
404 
405   shared_ptr<Civec> cbra = cc_->data(ist);
406   shared_ptr<Civec> cket = cc_->data(jst);
407 
408   // first make <I|E_ij|0>
409   auto dbra = make_shared<Dvec>(cbra->det(), norb_*norb_);
410   sigma_2a1(cbra, dbra);
411   sigma_2a2(cbra, dbra);
412 
413   shared_ptr<Dvec> dket = dbra;
414   if (cbra != cket) {
415     dket = dbra->clone();
416     sigma_2a1(cket, dket);
417     sigma_2a2(cket, dket);
418   }
419 
420   // second make <J|E_kl|I><I|E_ij|0> - delta_il <J|E_kj|0>
421   auto make_evec_half = [this](shared_ptr<Dvec> d, shared_ptr<Matrix> e, const int dsize, const int offset) {
422     const int norb2 = norb_ * norb_;
423     const int lena = cc_->det()->lena();
424     const int lenb = cc_->det()->lenb();
425     int no = 0;
426 
427     for (int ij = 0; ij != norb2; ++ij) {
428       const int j = ij/norb_;
429       const int i = ij-j*norb_;
430 
431       for (int kl = ij; kl != norb2; ++kl) {
432         const int l = kl/norb_;
433         const int k = kl-l*norb_;
434 
435         for (auto& iter : cc_->det()->phia(k,l)) {
436           size_t iaJ = iter.source;
437           size_t iaI = iter.target;
438           double sign = static_cast<double>(iter.sign);
439           for (size_t ib = 0; ib != lenb; ++ib) {
440             size_t iI = ib + iaI*lenb;
441             size_t iJ = ib + iaJ*lenb;
442             if ((iJ - offset) < dsize && iJ >= offset)
443               e->element(iJ-offset, no) += sign * d->data(ij)->data(iI);
444           }
445         }
446 
447         for (size_t ia = 0; ia != lena; ++ia) {
448           for (auto& iter : cc_->det()->phib(k,l)) {
449             size_t ibJ = iter.source;
450             size_t ibI = iter.target;
451             double sign = static_cast<double>(iter.sign);
452             size_t iI = ibI + ia*lenb;
453             size_t iJ = ibJ + ia*lenb;
454             if ((iJ - offset) < dsize && iJ >= offset)
455               e->element(iJ-offset, no) += sign * d->data(ij)->data(iI);
456           }
457         }
458 
459         if (i == l) {
460           const int kj = k+j*norb_;
461           for (size_t iJ = offset; iJ != offset+dsize; ++iJ) {
462             e->element(iJ-offset, no) -= d->data(kj)->data(iJ);
463           }
464         }
465         ++no;
466       }
467     }
468   };
469 
470   // RDM3, RDM4 construction is multipassed and parallelized:
471   //  (1) When ndet > 10000, (ndet < 10000 -> too small, almost no gain)
472   //  and (2) When we have processes more than one
473   //  OR  (3) When the number of words in <I|E_ij,kl|0> is larger than (10,10) case (635,040,000)
474   const size_t ndet = cbra->det()->size();
475   const size_t norb2 = norb_ * norb_;
476   const size_t ijmax = 635040001 * 2;
477   const size_t ijnum = ndet * norb2 * norb2;
478   const size_t npass = ((mpi__->size() * 2 > ((ijnum-1)/ijmax + 1)) && (mpi__->size() != 1) && ndet > 10000) ? mpi__->size() * 2 : (ijnum-1) / ijmax + 1;
479   const size_t nsize = (ndet-1) / npass + 1;
480   Timer timer;
481   if (npass > 1) {
482     cout << "    * Third and Fock-weighted fourth order RDM (" << setw(2) << ist + 1 << "," << setw(2) << jst + 1 << ") evaluation" << endl;
483     cout << "      will be done with " << npass << " passes" << endl;
484   }
485 
486   rdm3->zero();
487   rdm4f->zero();
488 
489   // multipassing through {I}
490   for (size_t ipass = 0; ipass != npass; ++ipass) {
491     if (ipass % mpi__->size() != mpi__->rank()) continue;
492 
493     const size_t ioffset = ipass * nsize;
494     const size_t isize = (ipass != (npass - 1)) ? nsize : ndet - ioffset;
495     const size_t halfsize = norb2 * (norb2 + 1) / 2;
496     auto eket_half = make_shared<Matrix>(isize, halfsize, /*local=*/true);
497     make_evec_half(dket, eket_half, isize, ioffset);
498     shared_ptr<Matrix> ebra_half = eket_half;
499     if (cbra != cket) {
500       ebra_half = eket_half->clone();
501       make_evec_half(dbra, ebra_half, isize, ioffset);
502     }
503 
504     auto feket = make_shared<Matrix>(isize, norb2, /*local=*/true);
505     feket->zero();
506     {
507       auto eket_full = make_shared<Matrix>(isize, norb2 * norb2, /*local=*/true);
508       size_t no = 0;
509       for (size_t kl = 0; kl != norb2; ++kl)
510         for (size_t ij = kl; ij != norb2; ++ij) {
511           size_t ijkl = ij + kl*norb2;
512           size_t klij = kl + ij*norb2;
513           for (size_t iI = 0; iI != isize; ++iI) {
514             eket_full->element(iI, ijkl) = eket_half->element(iI, no);
515             eket_full->element(iI, klij) = eket_half->element(iI, no);
516           }
517           ++no;
518         }
519       dgemv_("N", isize*norb_*norb_, norb_*norb_, 1.0, eket_full->data(), isize*norb_*norb_, fock->data(), 1, 0.0, feket->data(), 1);
520     }
521 
522     auto dbram = make_shared<Matrix>(isize, norb2, /*local=*/true);
523     for (size_t ij = 0; ij != norb2; ++ij)
524       copy_n(&(dbra->data(ij)->data(ioffset)), isize, dbram->element_ptr(0, ij));
525 
526     // put in third-order RDM: <0|E_mn|I><I|E_ij < kl|0>
527     auto tmp3 = make_shared<Matrix>(*dbram % *eket_half);
528     auto tmp3_full = make_shared<Matrix>(norb2, norb2 * norb2, /*local=*/true);
529     // <0|E_ij > kl|I>[I|mn|0]
530     auto tmp4 = make_shared<Matrix>(*ebra_half % *feket);
531     auto tmp4_full = make_shared<Matrix>(norb2 * norb2, norb2, /*local=*/true);
532 
533     for (size_t mn = 0; mn != norb2; ++mn) {
534       size_t no = 0;
535       for (size_t kl = 0; kl != norb2; ++kl) {
536         for (size_t ij = kl; ij != norb2; ++ij) {
537           size_t ijkl = ij + kl*norb2;
538           size_t klij = kl + ij*norb2;
539           tmp3_full->element(mn, ijkl) = tmp3->element(mn, no);
540           tmp3_full->element(mn, klij) = tmp3->element(mn, no);
541           tmp4_full->element(ijkl, mn) = tmp4->element(no, mn);
542           tmp4_full->element(klij, mn) = tmp4->element(no, mn);
543           ++no;
544         }
545       }
546     }
547     sort_indices<1,0,2,1,1,1,1>(tmp3_full->data(), rdm3->data(), norb_, norb_, norb2*norb2);
548     sort_indices<1,0,3,2,4,1,1,1,1>(tmp4_full->data(), rdm4f->data(), norb_, norb_, norb_, norb_, norb2);
549   }
550 
551   rdm3->allreduce();
552   rdm4f->allreduce();
553 
554   if (npass > 1) {
555     timer.tick_print("RDM evaluation (multipassing)");
556   }
557 
558   {
559     // then perform Eq. 49 of JCP 89 5803 (Werner's MRCI paper)
560     // we assume that rdm2_[ist] is set
561     for (int i0 = 0; i0 != norb_; ++i0)
562       for (int i1 = 0; i1 != norb_; ++i1)
563         for (int i2 = 0; i2 != norb_; ++i2)
564           for (int i3 = 0; i3 != norb_; ++i3) {
565             blas::ax_plus_y_n(-1.0, rdm2_->at(ist, jst)->element_ptr(0, i2, i1, i0), norb_, rdm3->element_ptr(0, i3, i3, i2, i1, i0));
566             blas::ax_plus_y_n(-1.0, rdm2_->at(ist, jst)->element_ptr(0, i0, i3, i2), norb_, rdm3->element_ptr(0, i1, i3, i2, i1, i0));
567           }
568   }
569 
570   {
571     // [0|E_ip,jl|0]
572     auto frdm3 = make_shared<RDM<2>>(norb_);
573     auto rdm3view = group(group(*rdm3, 4,6), 0,4);
574     auto frdm3view = group(*frdm3, 0,4);
575     contract(1.0, rdm3view, {0,1}, group(*fock, 0,2), {1}, 0.0, frdm3view, {0});
576 
577     // <0|E_ip,jn|0>f_nl (in this order)
578     auto prdm2 = make_shared<RDM<2>>(norb_);
579     auto prdm2v = group(*prdm2, 0,3);
580     contract(1.0, group(*rdm2_->at(ist, jst), 0,3), {0,1}, *fock, {1,2}, 0.0, prdm2v, {0,2});
581 
582     // <0|E_ik,jl,on|0>f_np (in this order)
583     auto prdm3 = make_shared<RDM<3>>(norb_);
584     auto prdm3v = group(*prdm3, 0,5);
585     contract(1.0, group(*rdm3, 0,5), {0,1}, *fock, {1,2}, 0.0, prdm3v, {0,2});
586 
587     sort_indices<2,0,1,1,1,-1,1>(prdm3->data(), rdm4f->data(), norb_*norb_, norb_*norb_, norb_*norb_);
588     sort_indices<0,2,1,1,1,-1,1>(prdm3->data(), rdm4f->data(), norb_*norb_, norb_*norb_, norb_*norb_);
589 
590     auto prdm2t = prdm2->clone();
591     sort_indices<1,0,0,1,1,1>(prdm2->data(), prdm2t->data(), norb_*norb_, norb_*norb_);
592     for (int p = 0; p != norb_; ++p)
593       for (int l = 0; l != norb_; ++l) {
594         blas::ax_plus_y_n(-1.0, prdm2t->element_ptr(0,0,0,p), norb_*norb_*norb_, rdm4f->element_ptr(0,0,0,l,l,p));
595         blas::ax_plus_y_n(-1.0, frdm3->element_ptr(0,0,0,p),  norb_*norb_*norb_, rdm4f->element_ptr(0,0,0,l,l,p));
596         for (int k = 0; k != norb_; ++k)
597           for (int j = 0; j != norb_; ++j) {
598             blas::ax_plus_y_n(-1.0, prdm2->element_ptr(0,p,j,l), norb_, rdm4f->element_ptr(0,k,j,l,k,p));
599             blas::ax_plus_y_n(-1.0, frdm3->element_ptr(0,p,j,l), norb_, rdm4f->element_ptr(0,k,j,l,k,p));
600           }
601       }
602   }
603 
604   cc_->set_det(det_);
605 
606   return make_tuple(rdm3, rdm4f);
607 }
608