1 //
2 // BAGEL - Parallel electron correlation program.
3 // Filename: box.cc
4 // Copyright (C) 2016 Toru Shiozaki
5 //
6 // Author: Hai-Anh Le <anh@u.northwestern.edu>
7 // Maintainer: Shiozaki group
8 //
9 // This file is part of the BAGEL package.
10 //
11 // The BAGEL package is free software; you can redistribute it and/or modify
12 // it under the terms of the GNU Library General Public License as published by
13 // the Free Software Foundation; either version 3, or (at your option)
14 // any later version.
15 //
16 // The BAGEL package is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 // GNU Library General Public License for more details.
20 //
21 // You should have received a copy of the GNU Library General Public License
22 // along with the BAGEL package; see COPYING.  If not, write to
23 // the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
24 //
25 
26 
27 #include <mutex>
28 #include <src/scf/fmm/box.h>
29 #include <src/integral/os/multipolebatch.h>
30 #include <src/integral/rys/eribatch.h>
31 #include <src/util/taskqueue.h>
32 #include <src/util/math/factorial.h>
33 #include <src/util/math/legendre.h>
34 
35 using namespace bagel;
36 using namespace std;
37 
38 const static Legendre plm;
39 const static Factorial f;
40 
init()41 void Box::init() {
42 
43   nchild_ = child_.size();
44   centre_ = {{0, 0, 0}};
45   extent_ = 0.0;
46   nsp_ = 0;
47   for (auto& i : sp_) {
48     if (i->schwarz() < schwarz_thresh_) continue;
49     ++nsp_;
50   }
51 
52   if (nchild_ == 0) {
53     for (auto& i : sp_) {
54       if (i->schwarz() < schwarz_thresh_) continue;
55       for (int j = 0; j != 3; ++j) centre_[j] += i->centre(j);
56     }
57     centre_[0] /= nsp_;
58     centre_[1] /= nsp_;
59     centre_[2] /= nsp_;
60     for (auto& i : sp_) {
61       if (i->schwarz() < schwarz_thresh_) continue;
62       double rad = 0;
63       for (int j = 0; j != 3; ++j) rad += pow(i->centre(j)-centre_[j], 2.0);
64       const double ei = sqrt(rad) + i->extent();
65       assert(ei > 0);
66       extent_ = max(extent_, ei);
67     }
68   } else {
69     for (int n = 0; n != nchild_; ++n) {
70       shared_ptr<const Box> i = child_[n].lock();
71       for (int j = 0; j != 3; ++j)  centre_[j] += i->centre(j);
72     }
73     centre_[0] /= nchild_;
74     centre_[1] /= nchild_;
75     centre_[2] /= nchild_;
76     for (int n = 0; n != nchild_; ++n) {
77       shared_ptr<const Box> i = child_[n].lock();
78       double rad = 0.0;
79       for (int j = 0; j != 3; ++j) rad += pow(i->centre(j)-centre_[j], 2.0);
80       const double ei = sqrt(rad) + i->extent();
81       assert(ei > 0);
82       extent_ = max(extent_, ei);
83     }
84   }
85 
86   ws_ = static_cast<int>(ceil(extent_/boxsize_));
87   nmult_ = (lmax_ + 1) * (lmax_ + 1);
88   olm_ = make_shared<ZVectorB>(nmult_);
89   mlm_ = make_shared<ZVectorB>(nmult_);
90 
91   nshell0_ = 0;
92   int cnt = 0;
93   for (auto& i : sp_) {
94     auto it = shell0_.insert(make_pair(i->shell(0), make_tuple(nshell0_, i->offset(0), cnt)));
95     if (it.second) {
96       nshell0_ += i->shell(0)->nbasis();
97       ++cnt;
98     }
99   }
100 }
101 
102 
insert_sp(const vector<shared_ptr<const ShellPair>> & sp)103 void Box::insert_sp(const vector<shared_ptr<const ShellPair>>& sp) {
104 
105   const int nsp = sp_.size();
106   sp_.resize(nsp + sp.size());
107   for (int i = 0; i != sp.size(); ++i)
108     sp_[i + nsp] = sp[i];
109 
110 }
111 
112 
insert_child(weak_ptr<const Box> c)113 void Box::insert_child(weak_ptr<const Box> c) {
114 
115   const int nchild = child_.size();
116   shared_ptr<const Box> child = c.lock();
117   if (child) {
118     child_.resize(nchild + 1);
119     child_[nchild] = child;
120   }
121 }
122 
123 
insert_parent(weak_ptr<const Box> p)124 void Box::insert_parent(weak_ptr<const Box> p) {
125 
126   parent_ = p;
127 }
128 
129 
get_neigh(const vector<weak_ptr<Box>> & box,const double ws)130 void Box::get_neigh(const vector<weak_ptr<Box>>& box, const double ws) {
131 
132   shared_ptr<const Box> box0 = box[0].lock();
133   if ((box.empty()) || (box0->rank() != rank_)) return;
134 
135   neigh_.resize(box.size());
136   nonneigh_.resize(box.size());
137   int nn = 0;
138   int nnn = 0;
139   for (int i = 0; i != box.size(); ++i) {
140     shared_ptr<const Box> b = box[i].lock();
141     if (is_neigh(b, ws)) {
142       neigh_[nn] = b;
143       ++nn;
144     } else {
145       nonneigh_[nnn] = b;
146       ++nnn;
147     }
148   }
149   neigh_.resize(nn);
150   nonneigh_.resize(nnn);
151   nneigh_ = neigh_.size();
152 }
153 
154 
is_neigh(weak_ptr<const Box> b,const double ws) const155 bool Box::is_neigh(weak_ptr<const Box> b, const double ws) const {
156 
157   shared_ptr<const Box> box = b.lock();
158   if (nchild_ == 0) {
159     double rr = 0;
160     for (int i = 0; i != 3; ++i) rr += pow(centre_[i] - box->centre(i), 2);
161     return (sqrt(rr) <= (1.0+ws)*(extent_ + box->extent()));
162   } else {
163     for (int n0 = 0; n0 != nchild_; ++n0) {
164       shared_ptr<const Box> i = child_[n0].lock();
165       for (int n1 = 0; n1 != box->child().size(); ++n1) {
166         shared_ptr<const Box> j = box->child()[n1].lock();
167         if (i->is_neigh(j, ws)) return true;
168       }
169     }
170     return false;
171   }
172 }
173 
get_inter(const vector<weak_ptr<Box>> & box,const double ws)174 void Box::get_inter(const vector<weak_ptr<Box>>& box, const double ws) {
175 
176   shared_ptr<const Box> box0 = box[0].lock();
177   if ((box.empty()) || (box0->rank() != rank_)) return;
178 
179   shared_ptr<const Box> parent = parent_.lock();
180   inter_.resize(box.size());
181   int ni = 0;
182   for (int i = 0; i != box.size(); ++i) {
183     shared_ptr<const Box> b = box[i].lock();
184     if ((!is_neigh(b, ws)) && (!parent || parent->is_neigh(b->parent(), ws))) {
185       inter_[ni] = b;
186       ++ni;
187     }
188   }
189   inter_.resize(ni);
190   ninter_ = inter_.size();
191 }
192 
193 
sort_sp()194 void Box::sort_sp() {
195 
196   if (nchild_ == 0) return;
197 
198   assert(nchild_ > 0);
199   vector<shared_ptr<const ShellPair>> newsp;
200   for (int n = 0; n != nchild_; ++n) {
201     shared_ptr<const Box> c = child_[n].lock();
202     const vector<shared_ptr<const ShellPair>>& csp = c->sp();
203     newsp.insert(newsp.end(), csp.begin(), csp.end());
204   }
205   assert(newsp.size() == sp_.size());
206   sp_ = newsp;
207 }
208 
209 
compute_M2M(shared_ptr<const Matrix> density)210 void Box::compute_M2M(shared_ptr<const Matrix> density) {
211 
212   olm_->fill(0.0);
213 
214   if (nchild_ == 0) { // leaf
215     TaskQueue<function<void(void)>> tasks(nsp_);
216     mutex jmutex;
217     for (auto& v : sp_) {
218       if (v->schwarz() < schwarz_thresh_) continue;
219 
220       tasks.emplace_back(
221         [this, &v, &density, &jmutex]() {
222           auto olm = make_shared<ZVectorB>(nmult_);
223           MultipoleBatch mpole(v->shells(), centre_, lmax_);
224           mpole.compute();
225           const int dimb0 = v->shell(0)->nbasis();
226           const int dimb1 = v->shell(1)->nbasis();
227 
228           for (int k = 0; k != mpole.num_blocks(); ++k) {
229             size_t cnt = 0;
230             for (int i = v->offset(1); i != dimb1 + v->offset(1); ++i) {
231               (*olm)(k) += conj(blas::dot_product_noconj(mpole.data() + mpole.size_block()*k + cnt, dimb0, density->element_ptr(v->offset(0), i)));
232               cnt += dimb0;
233             }
234           }
235 
236           lock_guard<mutex> lock(jmutex);
237           blas::ax_plus_y_n(1.0, olm->data(), nmult_, olm_->data());
238         }
239       );
240     }
241     tasks.compute();
242   } else { // shift children's multipoles
243     for (int n = 0; n != nchild_; ++n) {
244       shared_ptr<const Box> c = child_[n].lock();
245       const array<double, 3> r12 = {{c->centre(0) - centre_[0], c->centre(1) - centre_[1], c->centre(2) - centre_[2]}};
246       auto cmultipole = make_shared<ZMatrix>(1, nmult_);
247       copy_n(c->olm()->data(), nmult_, cmultipole->data());
248       shared_ptr<const ZMatrix> tmp = shift_multipolesX(lmax_, cmultipole, r12);
249       auto smoment = make_shared<ZVectorB>(nmult_);
250       copy_n(tmp->data(), nmult_, smoment->data());
251       blas::ax_plus_y_n(1.0, smoment->data(), nmult_, olm_->data());
252     }
253   }
254 }
255 
256 
257 //|su) C_ui C_sj
compute_M2M_X(shared_ptr<const Matrix> ocoeff_sj,shared_ptr<const Matrix> ocoeff_ui)258 void Box::compute_M2M_X(shared_ptr<const Matrix> ocoeff_sj, shared_ptr<const Matrix> ocoeff_ui) {
259 
260   const int nmult_k = (lmax_k_+1)*(lmax_k_+1);
261   olm_ndim_ = ocoeff_sj->mdim();  olm_mdim_ = ocoeff_ui->mdim();
262   olm_ji_ = make_shared<ZMatrix>(olm_ndim_*olm_mdim_, nmult_k);
263 
264   if (nchild_ == 0) { // leaf
265     ZMatrix interm(nshell0_, ocoeff_ui->mdim()*nmult_k, true);
266     TaskQueue<function<void(void)>> tasks(nsp_);
267     vector<mutex> mmutex(shell0_.size());
268 
269     for (auto& v : sp_) {
270       if (v->schwarz() < schwarz_thresh_) continue;
271 
272       tasks.emplace_back(
273         [this, v, ocoeff_ui, &mmutex, nmult_k, &interm]() {
274           MultipoleBatch olm_su(v->shells(), centre_, lmax_k_);
275           olm_su.compute();
276           const int dimb0 = v->shell(0)->nbasis();
277           const int dimb1 = v->shell(1)->nbasis();
278           const ZMatrix zui(*ocoeff_ui->cut(v->offset(1), v->offset(1)+dimb1), 1.0);
279           unique_ptr<complex<double>[]> tmp(new complex<double>[dimb0*ocoeff_ui->mdim()*nmult_k]);
280           for (int k = 0; k != nmult_k; ++k)
281             zgemm_("N", "N", dimb0, ocoeff_ui->mdim(), dimb1, 1.0, olm_su.data() + olm_su.size_block()*k, dimb0, zui.data(), dimb1,
282                                                               0.0, tmp.get()+dimb0*ocoeff_ui->mdim()*k, dimb0);
283           lock_guard<mutex> lock(mmutex[get<2>(shell0_.at(v->shell(0)))]);
284           const int off = get<0>(shell0_.at(v->shell(0)));
285           interm.add_block(1.0, off, 0, dimb0, ocoeff_ui->mdim()*nmult_k, tmp.get());
286         }
287       );
288     }
289     tasks.compute();
290     blas::conj_n(interm.data(), interm.size());
291 
292     ZMatrix icoeff(nshell0_, ocoeff_sj->mdim(), true);
293     for (auto& i : shell0_) {
294       const int local_offset = get<0>(i.second);
295       const int all_offset = get<1>(i.second);
296       const int size = i.first->nbasis();
297       for (int m = 0; m != ocoeff_sj->mdim(); ++m)
298         for (int n = 0; n != size; ++n)
299           icoeff(n+local_offset, m) = ocoeff_sj->element(n+all_offset, m);
300     }
301     zgemm_("C", "N", icoeff.mdim(), interm.mdim(), nshell0_, 1.0, icoeff.data(), nshell0_, interm.data(), nshell0_, 0.0, olm_ji_->data(), icoeff.mdim());
302 
303   } else { // shift children's multipoles
304     for (int n = 0; n != nchild_; ++n) {
305       shared_ptr<const Box> c = child_[n].lock();
306       const array<double, 3> r12 = {{c->centre(0) - centre_[0], c->centre(1) - centre_[1], c->centre(2) - centre_[2]}};
307       shared_ptr<const ZMatrix> smoment = shift_multipolesX(lmax_k_, c->olm_ji(), r12);
308       blas::ax_plus_y_n(1.0, smoment->data(), olm_ji_->size(), olm_ji_->data());
309     }
310   }
311 }
312 
313 
compute_L2L_X()314 void Box::compute_L2L_X() {
315 
316   // from parent
317   shared_ptr<const Box> parent = parent_.lock();
318   if (parent) {
319     const array<double, 3> r12 = {{centre_[0] - parent->centre(0), centre_[1] - parent->centre(1), centre_[2] - parent->centre(2)}};
320     shared_ptr<const ZMatrix> slocal = shift_localLX(lmax_k_, parent->mlm_ji(), r12);
321     blas::ax_plus_y_n(1.0, slocal->data(), mlm_ji_->size(), mlm_ji_->data());
322   }
323 }
324 
325 
compute_L2L()326 void Box::compute_L2L() {
327 
328   // from parent
329   shared_ptr<const Box> parent = parent_.lock();
330   if (parent) {
331     const array<double, 3> r12 = {{centre_[0] - parent->centre(0), centre_[1] - parent->centre(1), centre_[2] - parent->centre(2)}};
332     auto plocalJ = make_shared<ZMatrix>(1, nmult_);
333     copy_n(parent->mlm()->data(), nmult_, plocalJ->data());
334     shared_ptr<const ZMatrix> tmp = shift_localLX(lmax_, plocalJ, r12);
335     auto slocal = make_shared<ZVectorB>(nmult_);
336     copy_n(tmp->data(), nmult_, slocal->data());
337     blas::ax_plus_y_n(1.0, slocal->data(), nmult_, mlm_->data());
338   }
339 }
340 
341 
compute_M2L_X()342 void Box::compute_M2L_X() {
343 
344   mlm_ji_ = olm_ji_->clone();
345 
346   // from interaction list
347   for (int i = 0; i != ninter_; ++i) {
348     shared_ptr<const Box> it = inter_[i].lock();
349     const array<double, 3> r12 = {{centre_[0] - it->centre(0), centre_[1] - it->centre(1), centre_[2] - it->centre(2)}};
350     shared_ptr<const ZMatrix> slocal = shift_localMX(lmax_k_, it->olm_ji(), r12);
351     blas::ax_plus_y_n(1.0, slocal->data(), mlm_ji_->size(), mlm_ji_->data());
352   }
353 }
354 
355 
compute_M2L()356 void Box::compute_M2L() {
357 
358   mlm_->fill(0.0);
359   // from interaction list
360   for (int i = 0; i != ninter_; ++i) {
361     shared_ptr<const Box> it = inter_[i].lock();
362     const array<double, 3> r12 = {{centre_[0] - it->centre(0), centre_[1] - it->centre(1), centre_[2] - it->centre(2)}};
363     auto imultipole = make_shared<ZMatrix>(1, nmult_);
364     copy_n(it->olm()->data(), nmult_, imultipole->data());
365     shared_ptr<const ZMatrix> tmp = shift_localMX(lmax_, imultipole, r12);
366     auto slocal = make_shared<ZVectorB>(nmult_);
367     copy_n(tmp->data(), nmult_, slocal->data());
368     blas::ax_plus_y_n(1.0, slocal->data(), nmult_, mlm_->data());
369   }
370 }
371 
372 
compute_Fock_nf(shared_ptr<const Matrix> density,shared_ptr<const VectorB> max_den) const373 shared_ptr<const Matrix> Box::compute_Fock_nf(shared_ptr<const Matrix> density, shared_ptr<const VectorB> max_den) const {
374 
375   assert(nchild_ == 0);
376 
377   auto out = make_shared<Matrix>(density->ndim(), density->mdim());
378   out->zero();
379 
380   // NF: 4c integrals
381   const int shift = sizeof(int) * 4;
382   const int nsh = sqrt(max_den->size());
383   assert (nsh*nsh == max_den->size());
384 
385   int ntask = 0;
386   for (auto& v01 : sp_) {
387     if (v01->schwarz() < schwarz_thresh_) continue;
388     const int i0 = v01->shell_ind(1);
389 
390     const int i1 = v01->shell_ind(0);
391     if (i1 < i0) continue;
392 
393     const int i01 = i0 * nsh + i1;
394     const double density_01 = (*max_den)(i01) * 4.0;
395 
396     for (int n = 0; n != nneigh_; ++n) {
397       shared_ptr<const Box> neigh = neigh_[n].lock();
398       for (auto& v23 : neigh->sp()) {
399         if (v23->schwarz() < schwarz_thresh_) continue;
400         const int i2 = v23->shell_ind(1);
401         if (i2 < i0) continue;
402         const int i3 = v23->shell_ind(0);
403         if (i3 < i2) continue;
404         const int i23 = i2 * nsh + i3;
405         if (i23 < i01) continue;
406 
407         const double density_23 = (*max_den)(i23) * 4.0;
408         const double density_02 = (*max_den)(i0 * nsh + i2);
409         const double density_03 = (*max_den)(i0 * nsh + i3);
410         const double density_12 = (*max_den)(i1 * nsh + i2);
411         const double density_13 = (*max_den)(i1 * nsh + i3);
412 
413         const double mulfactor = max(max(max(density_01, density_02),
414                                          max(density_03, density_12)),
415                                          max(density_13, density_23));
416         const double integral_bound = mulfactor * v01->schwarz() * v23->schwarz();
417         const bool skip_schwarz = integral_bound < schwarz_thresh_;
418         if (skip_schwarz) continue;
419         ++ntask;
420       }
421     }
422   }
423 
424   TaskQueue<function<void(void)>> tasks(ntask);
425   mutex jmutex;
426 
427   for (auto& v01 : sp_) {
428     if (v01->schwarz() < schwarz_thresh_) continue;
429     shared_ptr<const Shell> b0 = v01->shell(1);
430     const int i0 = v01->shell_ind(1);
431     const int b0offset = v01->offset(1);
432     const int b0size = b0->nbasis();
433 
434     shared_ptr<const Shell> b1 = v01->shell(0);
435     const int i1 = v01->shell_ind(0);
436     if (i1 < i0) continue;
437     const int b1offset = v01->offset(0);
438     const int b1size = b1->nbasis();
439 
440     const int i01 = i0 * nsh + i1;
441     const double density_01 = (*max_den)(i01) * 4.0;
442 
443     for (int n = 0; n != nneigh_; ++n) {
444       shared_ptr<const Box> neigh = neigh_[n].lock();
445       for (auto& v23 : neigh->sp()) {
446         if (v23->schwarz() < schwarz_thresh_) continue;
447         shared_ptr<const Shell> b2 = v23->shell(1);
448         const int i2 = v23->shell_ind(1);
449         if (i2 < i0) continue;
450         const int b2offset = v23->offset(1);
451         const int b2size = b2->nbasis();
452 
453         shared_ptr<const Shell> b3 = v23->shell(0);
454         const int i3 = v23->shell_ind(0);
455         if (i3 < i2) continue;
456         const int b3offset = v23->offset(0);
457         const int b3size = b3->nbasis();
458 
459         const int i23 = i2 * nsh + i3;
460         if (i23 < i01) continue;
461 
462         const double density_23 = (*max_den)(i23) * 4.0;
463         const double density_02 = (*max_den)(i0 * nsh + i2);
464         const double density_03 = (*max_den)(i0 * nsh + i3);
465         const double density_12 = (*max_den)(i1 * nsh + i2);
466         const double density_13 = (*max_den)(i1 * nsh + i3);
467 
468         const double mulfactor = max(max(max(density_01, density_02),
469                                          max(density_03, density_12)),
470                                          max(density_13, density_23));
471         const double integral_bound = mulfactor * v01->schwarz() * v23->schwarz();
472         const bool skip_schwarz = integral_bound < schwarz_thresh_;
473         if (skip_schwarz) continue;
474 
475         array<shared_ptr<const Shell>,4> input = {{b3, b2, b1, b0}};
476 
477         tasks.emplace_back(
478           [this, &out, &density, input, b0offset, i01, i23, b0size, b1offset, b1size, b2offset, b2size, b3offset, b3size, mulfactor, &jmutex]() {
479 
480             ERIBatch eribatch(input, mulfactor);
481             eribatch.compute();
482             const double* eridata = eribatch.data();
483 
484             const double* density_data = density->data();
485             lock_guard<mutex> lock(jmutex);
486             for (int j0 = b0offset; j0 != b0offset + b0size; ++j0) {
487               const int j0n = j0 * density->ndim();
488               for (int j1 = b1offset; j1 != b1offset + b1size; ++j1) {
489                 if (j1 < j0) {
490                   eridata += b2size * b3size;
491                   continue;
492                 }
493                 const int j1n = j1 * density->ndim();
494                 const unsigned int nj01 = (j0 << shift) + j1;
495                 const double scale01 = (j0 == j1) ? 0.5 : 1.0;
496                 for (int j2 = b2offset; j2 != b2offset + b2size; ++j2) {
497                   const int j2n = j2 * density->ndim();
498                   for (int j3 = b3offset; j3 != b3offset + b3size; ++j3, ++eridata) {
499                     if (j3 < j2) continue;
500                     const unsigned int nj23 = (j2 << shift) + j3;
501                     if (nj23 < nj01 && i01 == i23) continue;
502                     const double scale23 = (j2 == j3) ? 0.5 : 1.0;
503                     const double scale = (nj01 == nj23) ? 0.25 : 0.5;
504 
505                     const double eri = *eridata;
506                     const double intval = eri * scale * scale01 * scale23;
507                     const double intval4 = 4.0 * intval;
508                     out->element(j1, j0) += density_data[j2n + j3] * intval4;
509                     out->element(j3, j2) += density_data[j0n + j1] * intval4;
510                     out->element(max(j2, j0), min(j2,j0)) -= density_data[j1n + j3] * intval;
511                     out->element(j3, j0) -= density_data[j1n + j2] * intval;
512                     out->element(max(j1,j2), min(j1,j2)) -= density_data[j0n + j3] * intval;
513                     out->element(max(j1,j3), min(j1,j3)) -= density_data[j0n + j2] * intval;
514                   }
515                 }
516               }
517             }
518 
519           }
520         );
521 
522       }
523     }
524   }
525 
526   tasks.compute();
527 
528   return out;
529 }
530 
531 
print_box() const532 void Box::print_box() const {
533   cout << "Box " << boxid_ << " Rank = " << rank_ << " *** nchild = " << nchild_ << " *** nsp = " << sp_.size()
534        << " *** nneigh = " << nneigh_ << " *** ninter = " << ninter_ << " *** extent = " << extent_
535        << " *** centre = " << setprecision(3) << centre_[0] << "  " << centre_[1] << "  " << centre_[2] << endl;
536 
537 //  cout << " *** Shell pairs at ***" << endl;
538 //  for (int i = 0; i != sp_.size(); ++i)
539 //    cout << setprecision(5) << sp(i)->centre(0) << "  " << sp(i)->centre(1) << "  " << sp(i)->centre(2) << endl;
540 }
541 
542 
compute_Fock_ff(shared_ptr<const Matrix> density) const543 shared_ptr<const Matrix> Box::compute_Fock_ff(shared_ptr<const Matrix> density) const {
544 
545   assert(nchild_ == 0);
546   auto out = make_shared<ZMatrix>(density->ndim(), density->mdim());
547 
548   // compute multipoles again - not efficient - for now
549   TaskQueue<function<void(void)>> tasks(nsp_);
550   mutex jmutex;
551   for (auto& v : sp_) {
552     if (v->schwarz() < schwarz_thresh_) continue;
553 
554     tasks.emplace_back(
555       [this, &out, &v, &density, &jmutex]() {
556         MultipoleBatch mpole(v->shells(), centre_, lmax_);
557         mpole.compute();
558         const int dimb0 = v->shell(0)->nbasis();
559         const int dimb1 = v->shell(1)->nbasis();
560         vector<const complex<double>*> dat(nmult_);
561         for (int i = 0; i != nmult_; ++i)
562           dat[i] = mpole.data() + mpole.size_block()*i;
563 
564         lock_guard<mutex> lock(jmutex);
565         for (int i = v->offset(1); i != dimb1 + v->offset(1); ++i)
566           for (int j = v->offset(0); j != dimb0 + v->offset(0); ++j)
567             for (int k = 0; k != mpole.num_blocks(); ++k)
568               out->element(i, j) += conj(*dat[k]++) * (*mlm_)(k);
569       }
570     );
571   }
572   tasks.compute();
573   assert(out->get_imag_part()->rms() < 1e-10);
574 
575   return out->get_real_part();
576 }
577 
578 
compute_Fock_ff_K(shared_ptr<const Matrix> ocoeff_ti) const579 shared_ptr<const Matrix> Box::compute_Fock_ff_K(shared_ptr<const Matrix> ocoeff_ti) const {
580 
581   assert(nchild_ == 0);
582   auto out = make_shared<Matrix>(ocoeff_ti->ndim(), olm_ndim_);
583   assert(ocoeff_ti->mdim() == olm_mdim_);
584   const int nmult_k = (lmax_k_+1)*(lmax_k_+1);
585   TaskQueue<function<void(void)>> tasks(nsp_);
586 
587   ZMatrix interm(nshell0_, ocoeff_ti->mdim()*nmult_k, true);
588   vector<mutex> mmutex(shell0_.size());
589 
590   for (auto& v : sp_) {
591     if (v->schwarz() < schwarz_thresh_) continue;
592 
593     tasks.emplace_back(
594       [this, v, ocoeff_ti, &mmutex, nmult_k, &interm]() {
595         MultipoleBatch olm_su(v->shells(), centre_, lmax_k_);
596         olm_su.compute();
597         const int dimb0 = v->shell(0)->nbasis();
598         const int dimb1 = v->shell(1)->nbasis();
599         const ZMatrix zti(*ocoeff_ti->cut(v->offset(1), v->offset(1)+dimb1), 1.0);
600         unique_ptr<complex<double>[]> tmp(new complex<double>[dimb0*ocoeff_ti->mdim()*nmult_k]);
601         for (int k = 0; k != nmult_k; ++k)
602           zgemm_("N", "N", dimb0, ocoeff_ti->mdim(), dimb1, 1.0, olm_su.data() + olm_su.size_block()*k, dimb0, zti.data(), dimb1,
603                                                             0.0, tmp.get()+dimb0*ocoeff_ti->mdim()*k, dimb0);
604         lock_guard<mutex> lock(mmutex[get<2>(shell0_.at(v->shell(0)))]);
605         const int off = get<0>(shell0_.at(v->shell(0)));
606         interm.add_block(1.0, off, 0, dimb0, ocoeff_ti->mdim()*nmult_k, tmp.get());
607       }
608     );
609   }
610   tasks.compute();
611 
612   ZMatrix kjr(olm_ndim_, nshell0_);
613   zgemm_("N", "C", olm_ndim_, nshell0_, olm_mdim_*nmult_k, 1.0, mlm_ji_->data(), olm_ndim_, interm.data(), nshell0_, 0.0, kjr.data(), olm_ndim_);
614 
615   auto krj = kjr.get_real_part()->transpose();
616 
617   for (auto& i : shell0_) {
618     const int dimb0 = i.first->nbasis();
619     const int local_offset = get<0>(i.second);
620     const int all_offset = get<1>(i.second);
621     for (int i = 0; i != olm_ndim_; ++i)
622       blas::ax_plus_y_n(1.0, krj->element_ptr(local_offset, i), dimb0, out->element_ptr(all_offset, i));
623   }
624 
625   return out;
626 }
627 
628 
compute_exact_ff(shared_ptr<const Matrix> density) const629 shared_ptr<const Matrix> Box::compute_exact_ff(shared_ptr<const Matrix> density) const { //for debug
630 
631   auto jff = make_shared<Matrix>(density->ndim(), density->mdim());
632   jff->zero();
633 
634   int ntask = 0;
635   for (auto& v01 : sp_) {
636     if (v01->schwarz() < schwarz_thresh_) continue;
637     for (int n = 0; n != nonneigh_.size(); ++n) {
638       shared_ptr<const Box> nonneigh = neigh_[n].lock();
639       for (auto& v23 : nonneigh->sp()) {
640         if (v23->schwarz() < schwarz_thresh_) continue;
641         ++ntask;
642       }
643     }
644   }
645 
646   TaskQueue<function<void(void)>> tasks(ntask);
647   mutex jmutex;
648 
649   for (auto& v01 : sp_) {
650     if (v01->schwarz() < schwarz_thresh_) continue;
651     shared_ptr<const Shell> b0 = v01->shell(1);
652     const int b0offset = v01->offset(1);
653     const int b0size = b0->nbasis();
654 
655     shared_ptr<const Shell> b1 = v01->shell(0);
656     const int b1offset = v01->offset(0);
657     const int b1size = b1->nbasis();
658 
659     for (int n = 0; n != nonneigh_.size(); ++n) {
660       shared_ptr<const Box> nonneigh = neigh_[n].lock();
661       for (auto& v23 : nonneigh->sp()) {
662         if (v23->schwarz() < schwarz_thresh_) continue;
663         shared_ptr<const Shell> b2 = v23->shell(1);
664         const int b2offset = v23->offset(1);
665         const int b2size = b2->nbasis();
666 
667         shared_ptr<const Shell> b3 = v23->shell(0);
668         const int b3offset = v23->offset(0);
669         const int b3size = b3->nbasis();
670 
671         array<shared_ptr<const Shell>,4> input = {{b3, b2, b1, b0}};
672 
673         tasks.emplace_back(
674           [this, &jff, &density, input, b0offset, b0size, b1offset, b1size, b2offset, b2size, b3offset, b3size, &jmutex]() {
675             const double* density_data = density->data();
676 
677             ERIBatch eribatch(input, 0.0);
678             eribatch.compute();
679             const double* eridata = eribatch.data();
680 
681             lock_guard<mutex> lock(jmutex);
682             for (int j0 = b0offset; j0 != b0offset + b0size; ++j0) {
683               for (int j1 = b1offset; j1 != b1offset + b1size; ++j1) {
684                 const int j1n = j1 * density->ndim();
685                 for (int j2 = b2offset; j2 != b2offset + b2size; ++j2) {
686                   const int j2n = j2 * density->ndim();
687                   for (int j3 = b3offset; j3 != b3offset + b3size; ++j3, ++eridata) {
688                     const double eri = *eridata;
689                     jff->element(j1, j0) += density_data[j2n + j3] * eri;
690                     jff->element(j3, j0) -= density_data[j1n + j2] * eri * 0.5;
691                   }
692                 }
693               }
694             }
695 
696           }
697         );
698 
699       }
700     }
701   }
702   tasks.compute();
703 
704   return jff;
705 }
706 
707 
708 // M2M for X
shift_multipolesX(const int lmax,shared_ptr<const ZMatrix> oa,array<double,3> rab) const709 shared_ptr<const ZMatrix> Box::shift_multipolesX(const int lmax, shared_ptr<const ZMatrix> oa, array<double, 3> rab) const {
710 
711   const double r = sqrt(rab[0]*rab[0] + rab[1]*rab[1] + rab[2]*rab[2]);
712   const double ctheta = (r > numerical_zero__) ? rab[2]/r : 0.0;
713   const double phi = atan2(rab[1], rab[0]);
714   const int nmult = (lmax+1)*(lmax+1);
715   const int olm_size_block = oa->ndim();
716 
717   shared_ptr<ZMatrix> ob = oa->clone();
718   unique_ptr<double[]> plm0(new double[nmult]);
719   for (int l = 0; l != lmax+1; ++l)
720     for (int m = 0; m <= 2 * l; ++m) {
721       const int b = m-l;
722       const double sign = (b >= 0) ? 1.0 : (1-((b&1)<<1));
723       plm0[l*l+m] = sign * plm.compute(l, abs(b), ctheta);
724     }
725 
726   unique_ptr<double[]> invfac(new double[2*lmax+1]);
727   fill_n(invfac.get(), 2*lmax+1, 1.0);
728   for (int i = 1; i <= 2*lmax; ++i)
729     for (int j = i; j <= 2*lmax; ++j)
730       invfac[j] /= i;
731 
732   ZMatrix lmjk(nmult, nmult);
733   for (int l = 0; l <= lmax; ++l) {
734     for (int j = 0; j <= lmax; ++j) {
735       const int a = l - j;
736       const double rr = pow(r, a);
737       if (a < 0) continue;
738       for (int m = 0; m <= 2 * l; ++m) {
739         const int lo = max(-a, m-l-j);
740         const int hi = min( a, m-l+j);
741         for (int b = lo; b <= hi; ++b) {
742           const int k = m - l - b + j;
743           const double prefactor = rr * plm0[a*a+a+b] * invfac[a+abs(b)];
744           const complex<double> Oab = polar(prefactor, -b*phi);
745           lmjk.element(l*l+m, j*j+k) = Oab;
746         }
747       }
748     }
749   }
750   zgemm_("N", "T", olm_size_block, nmult, nmult, 1.0, oa->data(), olm_size_block, lmjk.data(), nmult, 0.0, ob->data(), olm_size_block);
751 
752   return ob;
753 }
754 
755 
756 // L2L for X
shift_localLX(const int lmax,const shared_ptr<const ZMatrix> mr,array<double,3> rb) const757 shared_ptr<const ZMatrix> Box::shift_localLX(const int lmax, const shared_ptr<const ZMatrix> mr, array<double, 3> rb) const {
758 
759   const double r = sqrt(rb[0]*rb[0] + rb[1]*rb[1] + rb[2]*rb[2]);
760   const double ctheta = (r > numerical_zero__) ? rb[2]/r : 0.0;
761   const double phi = atan2(rb[1], rb[0]);
762   const int nmult = (lmax+1)*(lmax+1);
763   const int olm_size_block = mr->ndim();
764 
765   shared_ptr<ZMatrix> mrb  = mr->clone();
766   unique_ptr<double[]> plm0(new double[nmult]);
767   for (int l = 0; l != lmax+1; ++l)
768     for (int m = 0; m <= 2 * l; ++m) {
769       const int b = m-l;
770       const double sign = (b >= 0) ? 1.0 : (1-((b&1)<<1));
771       plm0[l*l+m] = sign * plm.compute(l, abs(b), ctheta);
772     }
773 
774   unique_ptr<double[]> invfac(new double[2*lmax+1]);
775   fill_n(invfac.get(), 2*lmax+1, 1.0);
776   for (int i = 1; i <= 2*lmax; ++i)
777     for (int j = i; j <= 2*lmax; ++j)
778       invfac[j] /= i;
779 
780   ZMatrix lmjk(nmult, nmult);
781   for (int l = 0; l <= lmax; ++l) {
782     for (int j = 0; j <= lmax; ++j) {
783       const int a = j - l;
784       const double rr = pow(r, a);
785       if (a < 0) continue;
786       for (int m = 0; m <= 2 * l; ++m) {
787         const int lo = max(-a, l-m-j);
788         const int hi = min( a, l-m+j);
789         for (int b = lo; b <= hi; ++b) {
790           const int k = m - l + b + j;
791           const double prefactor = rr * plm0[a*a+a+b] * invfac[a+abs(b)];
792           const complex<double> Oab = polar(prefactor, -b*phi);
793           lmjk.element(l*l+m, j*j+k) = Oab;
794         }
795       }
796     }
797   }
798 
799   zgemm_("N", "T", olm_size_block, nmult, nmult, 1.0, mr->data(), olm_size_block, lmjk.data(), nmult, 0.0, mrb->data(), olm_size_block);
800   return mrb;
801 }
802 
803 
804 // M2L for X
shift_localMX(const int lmax,shared_ptr<const ZMatrix> olm,array<double,3> r12) const805 shared_ptr<const ZMatrix> Box::shift_localMX(const int lmax, shared_ptr<const ZMatrix> olm, array<double, 3> r12) const {
806 
807   const double r = sqrt(r12[0]*r12[0] + r12[1]*r12[1] + r12[2]*r12[2]);
808   const double ctheta = (r > numerical_zero__) ? r12[2]/r : 0.0;
809   const double phi = atan2(r12[1], r12[0]);
810   const int nmult = (lmax+1)*(lmax+1);
811   const int olm_size_block = olm->ndim();
812 
813   shared_ptr<ZMatrix> mb = olm->clone();
814   unique_ptr<double[]> plm0(new double[(2*lmax+1)*(2*lmax+1)]);
815   for (int l = 0; l != 2*lmax+1; ++l)
816     for (int m = 0; m <= 2 * l; ++m) {
817       const int b = m-l;
818       const double sign = (b >= 0) ? 1.0 : (1-((b&1)<<1));
819       plm0[l*l+m] = sign * plm.compute(l, abs(b), ctheta);
820     }
821 
822   ZMatrix lmjk(nmult, nmult);
823   for (int l = 0; l <= lmax; ++l) {
824     const double phase_l = (1-((l&1)<<1));
825     for (int j = 0; j <= lmax; ++j) {
826       const int a = l + j;
827       const double rr = 1.0 / pow(r, a + 1);
828       for (int m = 0; m <= 2 * l; ++m) {
829         for (int k = 0; k <= 2 * j; ++k) {
830           const int b = m - l + k - j;
831           double prefactor = plm0[a*a+a+b] * rr * f(a-abs(b));
832           const complex<double> Mab = phase_l * polar(prefactor, b*phi);
833           lmjk.element(l*l+m, j*j+k) = Mab;
834         }
835       }
836     }
837   }
838 
839   zgemm_("N", "T", olm_size_block, nmult, nmult, 1.0, olm->data(), olm_size_block, lmjk.data(), nmult, 0.0, mb->data(), olm_size_block);
840   return mb;
841 }
842 
843 
compute_Fock_nf_J(shared_ptr<const Matrix> density,shared_ptr<const VectorB> max_den) const844 shared_ptr<const Matrix> Box::compute_Fock_nf_J(shared_ptr<const Matrix> density, shared_ptr<const VectorB> max_den) const {
845 
846   assert(nchild_ == 0);
847   auto out = make_shared<Matrix>(density->ndim(), density->mdim());
848   out->zero();
849 
850   // NF: 4c integrals
851   const int shift = sizeof(int) * 4;
852   const int nsh = sqrt(max_den->size());
853   assert (nsh*nsh == max_den->size());
854 
855   int ntask = 0;
856   for (auto& v01 : sp_) {
857     if (v01->schwarz() < schwarz_thresh_) continue;
858     const int i0 = v01->shell_ind(1);
859 
860     const int i1 = v01->shell_ind(0);
861     if (i1 < i0) continue;
862 
863     const int i01 = i0 * nsh + i1;
864     const double density_01 = (*max_den)(i01) * 4.0;
865 
866     for (int n = 0; n != nneigh_; ++n) {
867       shared_ptr<const Box> neigh = neigh_[n].lock();
868       for (auto& v23 : neigh->sp()) {
869         if (v23->schwarz() < schwarz_thresh_) continue;
870         const int i2 = v23->shell_ind(1);
871         if (i2 < i0) continue;
872         const int i3 = v23->shell_ind(0);
873         if (i3 < i2) continue;
874         const int i23 = i2 * nsh + i3;
875         if (i23 < i01) continue;
876 
877         const double density_23 = (*max_den)(i23) * 4.0;
878 
879         const double mulfactor = max(density_01, density_23);
880         const double integral_bound = mulfactor * v01->schwarz() * v23->schwarz();
881         const bool skip_schwarz = integral_bound < schwarz_thresh_;
882         if (skip_schwarz) continue;
883         ++ntask;
884       }
885     }
886   }
887 
888   TaskQueue<function<void(void)>> tasks(ntask);
889   mutex jmutex;
890 
891   for (auto& v01 : sp_) {
892     if (v01->schwarz() < schwarz_thresh_) continue;
893     shared_ptr<const Shell> b0 = v01->shell(1);
894     const int i0 = v01->shell_ind(1);
895     const int b0offset = v01->offset(1);
896     const int b0size = b0->nbasis();
897 
898     shared_ptr<const Shell> b1 = v01->shell(0);
899     const int i1 = v01->shell_ind(0);
900     if (i1 < i0) continue;
901     const int b1offset = v01->offset(0);
902     const int b1size = b1->nbasis();
903 
904     const int i01 = i0 * nsh + i1;
905     const double density_01 = (*max_den)(i01) * 4.0;
906 
907     for (int n = 0; n != nneigh_; ++n) {
908       shared_ptr<const Box> neigh = neigh_[n].lock();
909       for (auto& v23 : neigh->sp()) {
910         if (v23->schwarz() < schwarz_thresh_) continue;
911         shared_ptr<const Shell> b2 = v23->shell(1);
912         const int i2 = v23->shell_ind(1);
913         if (i2 < i0) continue;
914         const int b2offset = v23->offset(1);
915         const int b2size = b2->nbasis();
916 
917         shared_ptr<const Shell> b3 = v23->shell(0);
918         const int i3 = v23->shell_ind(0);
919         if (i3 < i2) continue;
920         const int b3offset = v23->offset(0);
921         const int b3size = b3->nbasis();
922 
923         const int i23 = i2 * nsh + i3;
924         if (i23 < i01) continue;
925 
926         const double density_23 = (*max_den)(i23) * 4.0;
927 
928         const double mulfactor = max(density_01, density_23);
929         const double integral_bound = mulfactor * v01->schwarz() * v23->schwarz();
930         const bool skip_schwarz = integral_bound < schwarz_thresh_;
931         if (skip_schwarz) continue;
932 
933         array<shared_ptr<const Shell>,4> input = {{b3, b2, b1, b0}};
934 
935         tasks.emplace_back(
936           [this, &out, &density, input, b0offset, i01, i23, b0size, b1offset, b1size, b2offset, b2size, b3offset, b3size, mulfactor, &jmutex]() {
937 
938             ERIBatch eribatch(input, mulfactor);
939             eribatch.compute();
940             const double* eridata = eribatch.data();
941 
942             const double* density_data = density->data();
943             lock_guard<mutex> lock(jmutex);
944             for (int j0 = b0offset; j0 != b0offset + b0size; ++j0) {
945               const int j0n = j0 * density->ndim();
946               for (int j1 = b1offset; j1 != b1offset + b1size; ++j1) {
947                 if (j1 < j0) {
948                   eridata += b2size * b3size;
949                   continue;
950                 }
951                 const unsigned int nj01 = (j0 << shift) + j1;
952                 const double scale01 = (j0 == j1) ? 0.5 : 1.0;
953                 for (int j2 = b2offset; j2 != b2offset + b2size; ++j2) {
954                   const int j2n = j2 * density->ndim();
955                   for (int j3 = b3offset; j3 != b3offset + b3size; ++j3, ++eridata) {
956                     if (j3 < j2) continue;
957                     const unsigned int nj23 = (j2 << shift) + j3;
958                     if (nj23 < nj01 && i01 == i23) continue;
959                     const double scale23 = (j2 == j3) ? 0.5 : 1.0;
960                     const double scale = (nj01 == nj23) ? 0.25 : 0.5;
961 
962                     const double eri = *eridata;
963                     const double intval = eri * scale * scale01 * scale23;
964                     const double intval4 = 4.0 * intval;
965                     out->element(j1, j0) += density_data[j2n + j3] * intval4;
966                     out->element(j3, j2) += density_data[j0n + j1] * intval4;
967                   }
968                 }
969               }
970             }
971 
972           }
973         );
974 
975       }
976     }
977   }
978 
979   tasks.compute();
980 
981   return out;
982 }
983 
984 
compute_Fock_nf_K(shared_ptr<const Matrix> density,shared_ptr<const VectorB> max_den) const985 shared_ptr<const Matrix> Box::compute_Fock_nf_K(shared_ptr<const Matrix> density, shared_ptr<const VectorB> max_den) const {
986 
987   assert(nchild_ == 0);
988   auto out = make_shared<Matrix>(density->ndim(), density->mdim());
989   out->zero();
990 
991   // NF: 4c integrals
992   const int shift = sizeof(int) * 4;
993   const int nsh = sqrt(max_den->size());
994   assert (nsh*nsh == max_den->size());
995 
996   int ntask = 0;
997   for (auto& v01 : sp_) {
998     if (v01->schwarz() < schwarz_thresh_) continue;
999     const int i0 = v01->shell_ind(1);
1000 
1001     const int i1 = v01->shell_ind(0);
1002     if (i1 < i0) continue;
1003 
1004     const int i01 = i0 * nsh + i1;
1005     for (int n = 0; n != nneigh_; ++n) {
1006       shared_ptr<const Box> neigh = neigh_[n].lock();
1007       for (auto& v23 : neigh->sp()) {
1008         if (v23->schwarz() < schwarz_thresh_) continue;
1009         const int i2 = v23->shell_ind(1);
1010         if (i2 < i0) continue;
1011         const int i3 = v23->shell_ind(0);
1012         if (i3 < i2) continue;
1013         const int i23 = i2 * nsh + i3;
1014         if (i23 < i01) continue;
1015 
1016         const double density_02 = (*max_den)(i0 * nsh + i2);
1017         const double density_03 = (*max_den)(i0 * nsh + i3);
1018         const double density_12 = (*max_den)(i1 * nsh + i2);
1019         const double density_13 = (*max_den)(i1 * nsh + i3);
1020 
1021         const double mulfactor = max(max(density_13, density_02),
1022                                      max(density_03, density_12));
1023         const double integral_bound = mulfactor * v01->schwarz() * v23->schwarz();
1024         const bool skip_schwarz = integral_bound < schwarz_thresh_;
1025         if (skip_schwarz) continue;
1026         ++ntask;
1027       }
1028     }
1029   }
1030 
1031   TaskQueue<function<void(void)>> tasks(ntask);
1032   mutex jmutex;
1033 
1034   for (auto& v01 : sp_) {
1035     if (v01->schwarz() < schwarz_thresh_) continue;
1036     shared_ptr<const Shell> b0 = v01->shell(1);
1037     const int i0 = v01->shell_ind(1);
1038     const int b0offset = v01->offset(1);
1039     const int b0size = b0->nbasis();
1040 
1041     shared_ptr<const Shell> b1 = v01->shell(0);
1042     const int i1 = v01->shell_ind(0);
1043     if (i1 < i0) continue;
1044     const int b1offset = v01->offset(0);
1045     const int b1size = b1->nbasis();
1046 
1047     const int i01 = i0 * nsh + i1;
1048     for (int n = 0; n != nneigh_; ++n) {
1049       shared_ptr<const Box> neigh = neigh_[n].lock();
1050       for (auto& v23 : neigh->sp()) {
1051         if (v23->schwarz() < schwarz_thresh_) continue;
1052         shared_ptr<const Shell> b2 = v23->shell(1);
1053         const int i2 = v23->shell_ind(1);
1054         if (i2 < i0) continue;
1055         const int b2offset = v23->offset(1);
1056         const int b2size = b2->nbasis();
1057 
1058         shared_ptr<const Shell> b3 = v23->shell(0);
1059         const int i3 = v23->shell_ind(0);
1060         if (i3 < i2) continue;
1061         const int b3offset = v23->offset(0);
1062         const int b3size = b3->nbasis();
1063 
1064         const int i23 = i2 * nsh + i3;
1065         if (i23 < i01) continue;
1066 
1067         const double density_02 = (*max_den)(i0 * nsh + i2);
1068         const double density_03 = (*max_den)(i0 * nsh + i3);
1069         const double density_12 = (*max_den)(i1 * nsh + i2);
1070         const double density_13 = (*max_den)(i1 * nsh + i3);
1071 
1072         const double mulfactor = max(max(density_13, density_02),
1073                                      max(density_03, density_12));
1074         const double integral_bound = mulfactor * v01->schwarz() * v23->schwarz();
1075         const bool skip_schwarz = integral_bound < schwarz_thresh_;
1076         if (skip_schwarz) continue;
1077 
1078         array<shared_ptr<const Shell>,4> input = {{b3, b2, b1, b0}};
1079 
1080         tasks.emplace_back(
1081           [this, &out, &density, input, b0offset, i01, i23, b0size, b1offset, b1size, b2offset, b2size, b3offset, b3size, mulfactor, &jmutex]() {
1082 
1083             ERIBatch eribatch(input, mulfactor);
1084             eribatch.compute();
1085             const double* eridata = eribatch.data();
1086 
1087             const double* density_data = density->data();
1088             lock_guard<mutex> lock(jmutex);
1089             for (int j0 = b0offset; j0 != b0offset + b0size; ++j0) {
1090               const int j0n = j0 * density->ndim();
1091               for (int j1 = b1offset; j1 != b1offset + b1size; ++j1) {
1092                 if (j1 < j0) {
1093                   eridata += b2size * b3size;
1094                   continue;
1095                 }
1096                 const int j1n = j1 * density->ndim();
1097                 const unsigned int nj01 = (j0 << shift) + j1;
1098                 const double scale01 = (j0 == j1) ? 0.5 : 1.0;
1099                 for (int j2 = b2offset; j2 != b2offset + b2size; ++j2) {
1100                   for (int j3 = b3offset; j3 != b3offset + b3size; ++j3, ++eridata) {
1101                     if (j3 < j2) continue;
1102                     const unsigned int nj23 = (j2 << shift) + j3;
1103                     if (nj23 < nj01 && i01 == i23) continue;
1104                     const double scale23 = (j2 == j3) ? 0.5 : 1.0;
1105                     const double scale = (nj01 == nj23) ? 0.25 : 0.5;
1106 
1107                     const double eri = *eridata;
1108                     const double intval = eri * scale * scale01 * scale23;
1109                     out->element(max(j2, j0), min(j2,j0)) -= density_data[j1n + j3] * intval;
1110                     out->element(j3, j0) -= density_data[j1n + j2] * intval;
1111                     out->element(max(j1,j2), min(j1,j2)) -= density_data[j0n + j3] * intval;
1112                     out->element(max(j1,j3), min(j1,j3)) -= density_data[j0n + j2] * intval;
1113                   }
1114                 }
1115               }
1116             }
1117 
1118           }
1119         );
1120 
1121       }
1122     }
1123   }
1124 
1125   tasks.compute();
1126 
1127   return out;
1128 }
1129