1 //
2 // BAGEL - Brilliantly Advanced General Electronic Structure Library
3 // Filename: gocompute.cc
4 // Copyright (C) 2012 Toru Shiozaki
5 //
6 // Author: Toru Shiozaki <shiozaki@northwestern.edu>
7 // Maintainer: Shiozaki group
8 //
9 // This file is part of the BAGEL package.
10 //
11 // This program is free software: you can redistribute it and/or modify
12 // it under the terms of the GNU General Public License as published by
13 // the Free Software Foundation, either version 3 of the License, or
14 // (at your option) any later version.
15 //
16 // This program is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 // GNU General Public License for more details.
20 //
21 // You should have received a copy of the GNU General Public License
22 // along with this program.  If not, see <http://www.gnu.org/licenses/>.
23 //
24 
25 
26 #include <src/integral/carsphlist.h>
27 #include <src/integral/os/goverlapbatch.h>
28 #include <src/util/math/comb.h>
29 
30 using namespace std;
31 using namespace bagel;
32 
33 const static Comb comb;
34 const static CarSphList carsphlist;
35 
compute()36 void GOverlapBatch::compute() {
37 
38   fill_n(data_, size_alloc_, 0.0);
39 
40   const int a = basisinfo_[0]->angular_number();
41   const int b = basisinfo_[1]->angular_number();
42   const int a2 = a+2;
43   const int b2 = b+2;
44   assert(amax_ == a+b+1);
45   const int acsize = (a+1)*(a+2)*(b+1)*(b+2)/4;
46 
47   double* const transx = stack_->get((amax_+1)*a2*b2);
48   double* const transy = stack_->get((amax_+1)*a2*b2);
49   double* const transz = stack_->get((amax_+1)*a2*b2);
50   fill_n(transx, (amax_+1)*a2*b2, 0.0);
51   fill_n(transy, (amax_+1)*a2*b2, 0.0);
52   fill_n(transz, (amax_+1)*a2*b2, 0.0);
53   for (int ib = 0, k = 0; ib <= b+1; ++ib) {
54     for (int ia = 0; ia <= a+1; ++ia, ++k) {
55       if (ia == a+1 && ib == b+1) continue;
56       for (int i = ia; i <= ia+ib; ++i) {
57         transx[i + (amax_+1)*k] = comb(ib, ia+ib-i) * pow(AB_[0], ia+ib-i);
58         transy[i + (amax_+1)*k] = comb(ib, ia+ib-i) * pow(AB_[1], ia+ib-i);
59         transz[i + (amax_+1)*k] = comb(ib, ia+ib-i) * pow(AB_[2], ia+ib-i);
60       }
61     }
62   }
63   const int worksize = amax_+1;
64   double* const workx = stack_->get(worksize);
65   double* const worky = stack_->get(worksize);
66   double* const workz = stack_->get(worksize);
67 
68   double* const bufx = stack_->get(a2*b2);
69   double* const bufy = stack_->get(a2*b2);
70   double* const bufz = stack_->get(a2*b2);
71   double* const bufx_a = stack_->get(a2*b2);
72   double* const bufy_a = stack_->get(a2*b2);
73   double* const bufz_a = stack_->get(a2*b2);
74 
75   // Perform VRR
76   for (int ii = 0; ii != prim0_ * prim1_; ++ii) {
77 
78     /// Sx(0 : i+j+1, 0) will be made here
79     workx[0] = coeffsx_[ii];
80     worky[0] = coeffsy_[ii];
81     workz[0] = coeffsz_[ii];
82     workx[1] = (P_[ii * 3    ] - basisinfo_[0]->position(0)) * workx[0];
83     worky[1] = (P_[ii * 3 + 1] - basisinfo_[0]->position(1)) * worky[0];
84     workz[1] = (P_[ii * 3 + 2] - basisinfo_[0]->position(2)) * workz[0];
85     for (int i = 2; i <= amax_; ++i) {
86       workx[i] = (P_[ii * 3    ] - basisinfo_[0]->position(0)) * workx[i - 1] + 0.5 * (i - 1) / xp_[ii] * workx[i - 2];
87       worky[i] = (P_[ii * 3 + 1] - basisinfo_[0]->position(1)) * worky[i - 1] + 0.5 * (i - 1) / xp_[ii] * worky[i - 2];
88       workz[i] = (P_[ii * 3 + 2] - basisinfo_[0]->position(2)) * workz[i - 1] + 0.5 * (i - 1) / xp_[ii] * workz[i - 2];
89     }
90     // HRR is done in one shot
91     dgemv_("T", amax_+1, a2*b2, 1.0, transx, amax_+1, workx, 1, 0.0, bufx, 1);
92     dgemv_("T", amax_+1, a2*b2, 1.0, transy, amax_+1, worky, 1, 0.0, bufy, 1);
93     dgemv_("T", amax_+1, a2*b2, 1.0, transz, amax_+1, workz, 1, 0.0, bufz, 1);
94 
95     const double alpha = xa_[ii];
96     for (int ib = 0; ib <= b; ++ib) {
97       for (int ia = 0; ia <= a; ++ia) {
98         bufx_a[ia+a2*ib] = 2.0*alpha*bufx[ia+1+a2*(ib)] - (ia != 0 ? ia*bufx[ia-1+a2*(ib)] : 0);
99         bufy_a[ia+a2*ib] = 2.0*alpha*bufy[ia+1+a2*(ib)] - (ia != 0 ? ia*bufy[ia-1+a2*(ib)] : 0);
100         bufz_a[ia+a2*ib] = 2.0*alpha*bufz[ia+1+a2*(ib)] - (ia != 0 ? ia*bufz[ia-1+a2*(ib)] : 0);
101       }
102     }
103 
104     /// assembly process
105     const int offset_ii = ii * acsize;
106     double* current_data0 = data_ + offset_ii;
107     double* current_data1 = data_ + offset_ii + size_block_;
108     double* current_data2 = data_ + offset_ii + size_block_*2;
109 
110     for (int iaz = 0; iaz <= a; ++iaz) {
111       for (int iay = 0; iay <= a - iaz; ++iay) {
112         const int iax = a - iaz - iay;
113         for (int ibz = 0; ibz <= b; ++ibz) {
114           for (int iby = 0; iby <= b - ibz; ++iby) {
115             const int ibx = b - ibz - iby;
116             *current_data0++ += bufx_a[iax+a2*ibx] * bufy  [iay+a2*iby] * bufz  [iaz+a2*ibz];
117             *current_data1++ += bufx  [iax+a2*ibx] * bufy_a[iay+a2*iby] * bufz  [iaz+a2*ibz];
118             *current_data2++ += bufx  [iax+a2*ibx] * bufy  [iay+a2*iby] * bufz_a[iaz+a2*ibz];
119           }
120         }
121       }
122     }
123 
124   } // end of primsize loop
125 
126   stack_->release(a2*b2, bufz_a);
127   stack_->release(a2*b2, bufy_a);
128   stack_->release(a2*b2, bufx_a);
129   stack_->release(a2*b2, bufz);
130   stack_->release(a2*b2, bufy);
131   stack_->release(a2*b2, bufx);
132 
133   stack_->release(worksize, workz);
134   stack_->release(worksize, worky);
135   stack_->release(worksize, workx);
136 
137   stack_->release((amax_+1)*a2*b2, transz);
138   stack_->release((amax_+1)*a2*b2, transy);
139   stack_->release((amax_+1)*a2*b2, transx);
140 
141   const size_t acpsize = acsize*cont0_*cont1_;
142   double* const bkup = stack_->get(acpsize);
143   double* cdata = data_;
144   const SortList sort(spherical_);
145   for (int i = 0; i != 3; ++i, cdata += size_block_) {
146     // first, contraction.
147     const double* source = cdata;
148     double* target = bkup;
149     perform_contraction(acsize, source, prim0_, prim1_, target,
150                         basisinfo_[0]->contractions(), basisinfo_[0]->contraction_ranges(), cont0_,
151                         basisinfo_[1]->contractions(), basisinfo_[1]->contraction_ranges(), cont1_);
152 
153     if (spherical_) {
154       const unsigned int carsph_index = basisinfo_[0]->angular_number() * ANG_HRR_END + basisinfo_[1]->angular_number();
155       const int nloops = cont0_ * cont1_;
156       source = bkup;
157       target = cdata;
158       carsphlist.carsphfunc_call(carsph_index, nloops, source, target);
159 
160       const unsigned int sort_index = basisinfo_[1]->angular_number() * ANG_HRR_END + basisinfo_[0]->angular_number();
161       source = cdata;
162       target = bkup;
163       sort.sortfunc_call(sort_index, target, source, cont1_, cont0_, 1, swap01_);
164       copy_n(bkup, acpsize, cdata);
165     } else {
166       const unsigned int sort_index = basisinfo_[1]->angular_number() * ANG_HRR_END + basisinfo_[0]->angular_number();
167       source = bkup;
168       target = cdata;
169       sort.sortfunc_call(sort_index, target, source, cont1_, cont0_, 1, swap01_);
170     }
171   }
172 
173   for (int i = 0; i != 3; ++i)
174     blas::ax_plus_y_n(-1.0, data_+i*size_block_, cont0_*cont1_*acsize, data_+(i+3)*size_block_);
175 
176   stack_->release(acpsize, bkup);
177 
178 }
179 
compute()180 void GDerivOverBatch::compute() {
181   fill_n(data_, size_alloc_, 0.0);
182 
183   const int a = basisinfo_[0]->angular_number();
184   const int b = basisinfo_[1]->angular_number();
185   const int a2 = a+2;
186   const int b2 = b+2;
187   assert(amax_ == a+b+1);
188   const int acsize = (a+1)*(a+2)*(b+1)*(b+2)/4;
189 
190   double* const transx = stack_->get((amax_+1)*a2*b2);
191   double* const transy = stack_->get((amax_+1)*a2*b2);
192   double* const transz = stack_->get((amax_+1)*a2*b2);
193   fill_n(transx, (amax_+1)*a2*b2, 0.0);
194   fill_n(transy, (amax_+1)*a2*b2, 0.0);
195   fill_n(transz, (amax_+1)*a2*b2, 0.0);
196   for (int ib = 0, k = 0; ib <= b+1; ++ib) {
197     for (int ia = 0; ia <= a+1; ++ia, ++k) {
198       if (ia == a+1 && ib == b+1) continue;
199       for (int i = ia; i <= ia+ib; ++i) {
200         transx[i + (amax_+1)*k] = comb(ib, ia+ib-i) * pow(AB_[0], ia+ib-i);
201         transy[i + (amax_+1)*k] = comb(ib, ia+ib-i) * pow(AB_[1], ia+ib-i);
202         transz[i + (amax_+1)*k] = comb(ib, ia+ib-i) * pow(AB_[2], ia+ib-i);
203       }
204     }
205   }
206   const int worksize = amax_+1;
207   double* const workx = stack_->get(worksize);
208   double* const worky = stack_->get(worksize);
209   double* const workz = stack_->get(worksize);
210 
211   double* const bufx = stack_->get(a2*b2);
212   double* const bufy = stack_->get(a2*b2);
213   double* const bufz = stack_->get(a2*b2);
214   double* const bufx_a = stack_->get(a2*b2);
215   double* const bufy_a = stack_->get(a2*b2);
216   double* const bufz_a = stack_->get(a2*b2);
217 
218   for (int ii = 0; ii != prim0_ * prim1_; ++ii) {
219     workx[0] = coeffsx_[ii];
220     worky[0] = coeffsy_[ii];
221     workz[0] = coeffsz_[ii];
222     workx[1] = (P_[ii * 3    ] - basisinfo_[0]->position(0)) * workx[0];
223     worky[1] = (P_[ii * 3 + 1] - basisinfo_[0]->position(1)) * worky[0];
224     workz[1] = (P_[ii * 3 + 2] - basisinfo_[0]->position(2)) * workz[0];
225     for (int i = 2; i <= amax_; ++i) {
226       workx[i] = (P_[ii * 3    ] - basisinfo_[0]->position(0)) * workx[i - 1] + 0.5 * (i - 1) / xp_[ii] * workx[i - 2];
227       worky[i] = (P_[ii * 3 + 1] - basisinfo_[0]->position(1)) * worky[i - 1] + 0.5 * (i - 1) / xp_[ii] * worky[i - 2];
228       workz[i] = (P_[ii * 3 + 2] - basisinfo_[0]->position(2)) * workz[i - 1] + 0.5 * (i - 1) / xp_[ii] * workz[i - 2];
229     }
230 
231     dgemv_("T", amax_+1, a2*b2, 1.0, transx, amax_+1, workx, 1, 0.0, bufx, 1);
232     dgemv_("T", amax_+1, a2*b2, 1.0, transy, amax_+1, worky, 1, 0.0, bufy, 1);
233     dgemv_("T", amax_+1, a2*b2, 1.0, transz, amax_+1, workz, 1, 0.0, bufz, 1);
234 
235     const double alpha = xa_[ii];
236     const double beta = xb_[ii];
237 
238     if (!swap01_) {
239       for (int ib = 0; ib <= b; ++ib) {
240         for (int ia = 0; ia <= a; ++ia) {
241           bufx_a[ia+a2*ib] = 2.0*beta*bufx[ia+a2*(ib+1)] - (ib != 0 ? ib*bufx[ia+a2*(ib-1)] : 0);
242           bufy_a[ia+a2*ib] = 2.0*beta*bufy[ia+a2*(ib+1)] - (ib != 0 ? ib*bufy[ia+a2*(ib-1)] : 0);
243           bufz_a[ia+a2*ib] = 2.0*beta*bufz[ia+a2*(ib+1)] - (ib != 0 ? ib*bufz[ia+a2*(ib-1)] : 0);
244         }
245       }
246     } else {
247       for (int ib = 0; ib <= b; ++ib) {
248         for (int ia = 0; ia <= a; ++ia) {
249           bufx_a[ia+a2*ib] = 2.0*alpha*bufx[ia+1+a2*(ib)] - (ia != 0 ? ia*bufx[ia-1+a2*(ib)] : 0);
250           bufy_a[ia+a2*ib] = 2.0*alpha*bufy[ia+1+a2*(ib)] - (ia != 0 ? ia*bufy[ia-1+a2*(ib)] : 0);
251           bufz_a[ia+a2*ib] = 2.0*alpha*bufz[ia+1+a2*(ib)] - (ia != 0 ? ia*bufz[ia-1+a2*(ib)] : 0);
252         }
253       }
254     }
255 
256     const int offset_ii = ii * acsize;
257     double* current_data0 = data_ + offset_ii;
258     double* current_data1 = data_ + offset_ii + size_block_;
259     double* current_data2 = data_ + offset_ii + size_block_*2;
260 
261     for (int iaz = 0; iaz <= a; ++iaz) {
262       for (int iay = 0; iay <= a - iaz; ++iay) {
263         const int iax = a - iaz - iay;
264         for (int ibz = 0; ibz <= b; ++ibz) {
265           for (int iby = 0; iby <= b - ibz; ++iby) {
266             const int ibx = b - ibz - iby;
267             *current_data0++ += bufx_a[iax+a2*ibx] * bufy  [iay+a2*iby] * bufz  [iaz+a2*ibz];
268             *current_data1++ += bufx  [iax+a2*ibx] * bufy_a[iay+a2*iby] * bufz  [iaz+a2*ibz];
269             *current_data2++ += bufx  [iax+a2*ibx] * bufy  [iay+a2*iby] * bufz_a[iaz+a2*ibz];
270           }
271         }
272       }
273     }
274   } // end of primsize loop
275 
276   stack_->release(a2*b2, bufz_a);
277   stack_->release(a2*b2, bufy_a);
278   stack_->release(a2*b2, bufx_a);
279   stack_->release(a2*b2, bufz);
280   stack_->release(a2*b2, bufy);
281   stack_->release(a2*b2, bufx);
282 
283   stack_->release(worksize, workz);
284   stack_->release(worksize, worky);
285   stack_->release(worksize, workx);
286 
287   stack_->release((amax_+1)*a2*b2, transz);
288   stack_->release((amax_+1)*a2*b2, transy);
289   stack_->release((amax_+1)*a2*b2, transx);
290 
291   const size_t acpsize = acsize*cont0_*cont1_;
292   double* const bkup = stack_->get(acpsize);
293   double* cdata = data_;
294   const SortList sort(spherical_);
295   for (int i = 0; i != 3; ++i, cdata += size_block_) {
296     const double* source = cdata;
297     double* target = bkup;
298     perform_contraction(acsize, source, prim0_, prim1_, target,
299                         basisinfo_[0]->contractions(), basisinfo_[0]->contraction_ranges(), cont0_,
300                         basisinfo_[1]->contractions(), basisinfo_[1]->contraction_ranges(), cont1_);
301 
302     if (spherical_) {
303       const unsigned int carsph_index = basisinfo_[0]->angular_number() * ANG_HRR_END + basisinfo_[1]->angular_number();
304       const int nloops = cont0_ * cont1_;
305       source = bkup;
306       target = cdata;
307       carsphlist.carsphfunc_call(carsph_index, nloops, source, target);
308 
309       const unsigned int sort_index = basisinfo_[1]->angular_number() * ANG_HRR_END + basisinfo_[0]->angular_number();
310       source = cdata;
311       target = bkup;
312       sort.sortfunc_call(sort_index, target, source, cont1_, cont0_, 1, swap01_);
313       copy_n(bkup, acpsize, cdata);
314     } else {
315       const unsigned int sort_index = basisinfo_[1]->angular_number() * ANG_HRR_END + basisinfo_[0]->angular_number();
316       source = bkup;
317       target = cdata;
318       sort.sortfunc_call(sort_index, target, source, cont1_, cont0_, 1, swap01_);
319     }
320   }
321 
322   if (!swap01_)
323     for (int i = 0; i != 3; ++i) {
324       blas::ax_plus_y_n(1.0, data_+i*size_block_, cont0_*cont1_*acsize, data_+(i+3)*size_block_);
325       blas::ax_plus_y_n(-1.0, data_+i*size_block_, cont0_*cont1_*acsize, data_+i*size_block_);
326     }
327 
328   stack_->release(acpsize, bkup);
329 }
330