1 //
2 // BAGEL - Brilliantly Advanced General Electronic Structure Library
3 // Filename: gocompute.cc
4 // Copyright (C) 2012 Toru Shiozaki
5 //
6 // Author: Toru Shiozaki <shiozaki@northwestern.edu>
7 // Maintainer: Shiozaki group
8 //
9 // This file is part of the BAGEL package.
10 //
11 // This program is free software: you can redistribute it and/or modify
12 // it under the terms of the GNU General Public License as published by
13 // the Free Software Foundation, either version 3 of the License, or
14 // (at your option) any later version.
15 //
16 // This program is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 // GNU General Public License for more details.
20 //
21 // You should have received a copy of the GNU General Public License
22 // along with this program. If not, see <http://www.gnu.org/licenses/>.
23 //
24
25
26 #include <src/integral/carsphlist.h>
27 #include <src/integral/os/goverlapbatch.h>
28 #include <src/util/math/comb.h>
29
30 using namespace std;
31 using namespace bagel;
32
33 const static Comb comb;
34 const static CarSphList carsphlist;
35
compute()36 void GOverlapBatch::compute() {
37
38 fill_n(data_, size_alloc_, 0.0);
39
40 const int a = basisinfo_[0]->angular_number();
41 const int b = basisinfo_[1]->angular_number();
42 const int a2 = a+2;
43 const int b2 = b+2;
44 assert(amax_ == a+b+1);
45 const int acsize = (a+1)*(a+2)*(b+1)*(b+2)/4;
46
47 double* const transx = stack_->get((amax_+1)*a2*b2);
48 double* const transy = stack_->get((amax_+1)*a2*b2);
49 double* const transz = stack_->get((amax_+1)*a2*b2);
50 fill_n(transx, (amax_+1)*a2*b2, 0.0);
51 fill_n(transy, (amax_+1)*a2*b2, 0.0);
52 fill_n(transz, (amax_+1)*a2*b2, 0.0);
53 for (int ib = 0, k = 0; ib <= b+1; ++ib) {
54 for (int ia = 0; ia <= a+1; ++ia, ++k) {
55 if (ia == a+1 && ib == b+1) continue;
56 for (int i = ia; i <= ia+ib; ++i) {
57 transx[i + (amax_+1)*k] = comb(ib, ia+ib-i) * pow(AB_[0], ia+ib-i);
58 transy[i + (amax_+1)*k] = comb(ib, ia+ib-i) * pow(AB_[1], ia+ib-i);
59 transz[i + (amax_+1)*k] = comb(ib, ia+ib-i) * pow(AB_[2], ia+ib-i);
60 }
61 }
62 }
63 const int worksize = amax_+1;
64 double* const workx = stack_->get(worksize);
65 double* const worky = stack_->get(worksize);
66 double* const workz = stack_->get(worksize);
67
68 double* const bufx = stack_->get(a2*b2);
69 double* const bufy = stack_->get(a2*b2);
70 double* const bufz = stack_->get(a2*b2);
71 double* const bufx_a = stack_->get(a2*b2);
72 double* const bufy_a = stack_->get(a2*b2);
73 double* const bufz_a = stack_->get(a2*b2);
74
75 // Perform VRR
76 for (int ii = 0; ii != prim0_ * prim1_; ++ii) {
77
78 /// Sx(0 : i+j+1, 0) will be made here
79 workx[0] = coeffsx_[ii];
80 worky[0] = coeffsy_[ii];
81 workz[0] = coeffsz_[ii];
82 workx[1] = (P_[ii * 3 ] - basisinfo_[0]->position(0)) * workx[0];
83 worky[1] = (P_[ii * 3 + 1] - basisinfo_[0]->position(1)) * worky[0];
84 workz[1] = (P_[ii * 3 + 2] - basisinfo_[0]->position(2)) * workz[0];
85 for (int i = 2; i <= amax_; ++i) {
86 workx[i] = (P_[ii * 3 ] - basisinfo_[0]->position(0)) * workx[i - 1] + 0.5 * (i - 1) / xp_[ii] * workx[i - 2];
87 worky[i] = (P_[ii * 3 + 1] - basisinfo_[0]->position(1)) * worky[i - 1] + 0.5 * (i - 1) / xp_[ii] * worky[i - 2];
88 workz[i] = (P_[ii * 3 + 2] - basisinfo_[0]->position(2)) * workz[i - 1] + 0.5 * (i - 1) / xp_[ii] * workz[i - 2];
89 }
90 // HRR is done in one shot
91 dgemv_("T", amax_+1, a2*b2, 1.0, transx, amax_+1, workx, 1, 0.0, bufx, 1);
92 dgemv_("T", amax_+1, a2*b2, 1.0, transy, amax_+1, worky, 1, 0.0, bufy, 1);
93 dgemv_("T", amax_+1, a2*b2, 1.0, transz, amax_+1, workz, 1, 0.0, bufz, 1);
94
95 const double alpha = xa_[ii];
96 for (int ib = 0; ib <= b; ++ib) {
97 for (int ia = 0; ia <= a; ++ia) {
98 bufx_a[ia+a2*ib] = 2.0*alpha*bufx[ia+1+a2*(ib)] - (ia != 0 ? ia*bufx[ia-1+a2*(ib)] : 0);
99 bufy_a[ia+a2*ib] = 2.0*alpha*bufy[ia+1+a2*(ib)] - (ia != 0 ? ia*bufy[ia-1+a2*(ib)] : 0);
100 bufz_a[ia+a2*ib] = 2.0*alpha*bufz[ia+1+a2*(ib)] - (ia != 0 ? ia*bufz[ia-1+a2*(ib)] : 0);
101 }
102 }
103
104 /// assembly process
105 const int offset_ii = ii * acsize;
106 double* current_data0 = data_ + offset_ii;
107 double* current_data1 = data_ + offset_ii + size_block_;
108 double* current_data2 = data_ + offset_ii + size_block_*2;
109
110 for (int iaz = 0; iaz <= a; ++iaz) {
111 for (int iay = 0; iay <= a - iaz; ++iay) {
112 const int iax = a - iaz - iay;
113 for (int ibz = 0; ibz <= b; ++ibz) {
114 for (int iby = 0; iby <= b - ibz; ++iby) {
115 const int ibx = b - ibz - iby;
116 *current_data0++ += bufx_a[iax+a2*ibx] * bufy [iay+a2*iby] * bufz [iaz+a2*ibz];
117 *current_data1++ += bufx [iax+a2*ibx] * bufy_a[iay+a2*iby] * bufz [iaz+a2*ibz];
118 *current_data2++ += bufx [iax+a2*ibx] * bufy [iay+a2*iby] * bufz_a[iaz+a2*ibz];
119 }
120 }
121 }
122 }
123
124 } // end of primsize loop
125
126 stack_->release(a2*b2, bufz_a);
127 stack_->release(a2*b2, bufy_a);
128 stack_->release(a2*b2, bufx_a);
129 stack_->release(a2*b2, bufz);
130 stack_->release(a2*b2, bufy);
131 stack_->release(a2*b2, bufx);
132
133 stack_->release(worksize, workz);
134 stack_->release(worksize, worky);
135 stack_->release(worksize, workx);
136
137 stack_->release((amax_+1)*a2*b2, transz);
138 stack_->release((amax_+1)*a2*b2, transy);
139 stack_->release((amax_+1)*a2*b2, transx);
140
141 const size_t acpsize = acsize*cont0_*cont1_;
142 double* const bkup = stack_->get(acpsize);
143 double* cdata = data_;
144 const SortList sort(spherical_);
145 for (int i = 0; i != 3; ++i, cdata += size_block_) {
146 // first, contraction.
147 const double* source = cdata;
148 double* target = bkup;
149 perform_contraction(acsize, source, prim0_, prim1_, target,
150 basisinfo_[0]->contractions(), basisinfo_[0]->contraction_ranges(), cont0_,
151 basisinfo_[1]->contractions(), basisinfo_[1]->contraction_ranges(), cont1_);
152
153 if (spherical_) {
154 const unsigned int carsph_index = basisinfo_[0]->angular_number() * ANG_HRR_END + basisinfo_[1]->angular_number();
155 const int nloops = cont0_ * cont1_;
156 source = bkup;
157 target = cdata;
158 carsphlist.carsphfunc_call(carsph_index, nloops, source, target);
159
160 const unsigned int sort_index = basisinfo_[1]->angular_number() * ANG_HRR_END + basisinfo_[0]->angular_number();
161 source = cdata;
162 target = bkup;
163 sort.sortfunc_call(sort_index, target, source, cont1_, cont0_, 1, swap01_);
164 copy_n(bkup, acpsize, cdata);
165 } else {
166 const unsigned int sort_index = basisinfo_[1]->angular_number() * ANG_HRR_END + basisinfo_[0]->angular_number();
167 source = bkup;
168 target = cdata;
169 sort.sortfunc_call(sort_index, target, source, cont1_, cont0_, 1, swap01_);
170 }
171 }
172
173 for (int i = 0; i != 3; ++i)
174 blas::ax_plus_y_n(-1.0, data_+i*size_block_, cont0_*cont1_*acsize, data_+(i+3)*size_block_);
175
176 stack_->release(acpsize, bkup);
177
178 }
179
compute()180 void GDerivOverBatch::compute() {
181 fill_n(data_, size_alloc_, 0.0);
182
183 const int a = basisinfo_[0]->angular_number();
184 const int b = basisinfo_[1]->angular_number();
185 const int a2 = a+2;
186 const int b2 = b+2;
187 assert(amax_ == a+b+1);
188 const int acsize = (a+1)*(a+2)*(b+1)*(b+2)/4;
189
190 double* const transx = stack_->get((amax_+1)*a2*b2);
191 double* const transy = stack_->get((amax_+1)*a2*b2);
192 double* const transz = stack_->get((amax_+1)*a2*b2);
193 fill_n(transx, (amax_+1)*a2*b2, 0.0);
194 fill_n(transy, (amax_+1)*a2*b2, 0.0);
195 fill_n(transz, (amax_+1)*a2*b2, 0.0);
196 for (int ib = 0, k = 0; ib <= b+1; ++ib) {
197 for (int ia = 0; ia <= a+1; ++ia, ++k) {
198 if (ia == a+1 && ib == b+1) continue;
199 for (int i = ia; i <= ia+ib; ++i) {
200 transx[i + (amax_+1)*k] = comb(ib, ia+ib-i) * pow(AB_[0], ia+ib-i);
201 transy[i + (amax_+1)*k] = comb(ib, ia+ib-i) * pow(AB_[1], ia+ib-i);
202 transz[i + (amax_+1)*k] = comb(ib, ia+ib-i) * pow(AB_[2], ia+ib-i);
203 }
204 }
205 }
206 const int worksize = amax_+1;
207 double* const workx = stack_->get(worksize);
208 double* const worky = stack_->get(worksize);
209 double* const workz = stack_->get(worksize);
210
211 double* const bufx = stack_->get(a2*b2);
212 double* const bufy = stack_->get(a2*b2);
213 double* const bufz = stack_->get(a2*b2);
214 double* const bufx_a = stack_->get(a2*b2);
215 double* const bufy_a = stack_->get(a2*b2);
216 double* const bufz_a = stack_->get(a2*b2);
217
218 for (int ii = 0; ii != prim0_ * prim1_; ++ii) {
219 workx[0] = coeffsx_[ii];
220 worky[0] = coeffsy_[ii];
221 workz[0] = coeffsz_[ii];
222 workx[1] = (P_[ii * 3 ] - basisinfo_[0]->position(0)) * workx[0];
223 worky[1] = (P_[ii * 3 + 1] - basisinfo_[0]->position(1)) * worky[0];
224 workz[1] = (P_[ii * 3 + 2] - basisinfo_[0]->position(2)) * workz[0];
225 for (int i = 2; i <= amax_; ++i) {
226 workx[i] = (P_[ii * 3 ] - basisinfo_[0]->position(0)) * workx[i - 1] + 0.5 * (i - 1) / xp_[ii] * workx[i - 2];
227 worky[i] = (P_[ii * 3 + 1] - basisinfo_[0]->position(1)) * worky[i - 1] + 0.5 * (i - 1) / xp_[ii] * worky[i - 2];
228 workz[i] = (P_[ii * 3 + 2] - basisinfo_[0]->position(2)) * workz[i - 1] + 0.5 * (i - 1) / xp_[ii] * workz[i - 2];
229 }
230
231 dgemv_("T", amax_+1, a2*b2, 1.0, transx, amax_+1, workx, 1, 0.0, bufx, 1);
232 dgemv_("T", amax_+1, a2*b2, 1.0, transy, amax_+1, worky, 1, 0.0, bufy, 1);
233 dgemv_("T", amax_+1, a2*b2, 1.0, transz, amax_+1, workz, 1, 0.0, bufz, 1);
234
235 const double alpha = xa_[ii];
236 const double beta = xb_[ii];
237
238 if (!swap01_) {
239 for (int ib = 0; ib <= b; ++ib) {
240 for (int ia = 0; ia <= a; ++ia) {
241 bufx_a[ia+a2*ib] = 2.0*beta*bufx[ia+a2*(ib+1)] - (ib != 0 ? ib*bufx[ia+a2*(ib-1)] : 0);
242 bufy_a[ia+a2*ib] = 2.0*beta*bufy[ia+a2*(ib+1)] - (ib != 0 ? ib*bufy[ia+a2*(ib-1)] : 0);
243 bufz_a[ia+a2*ib] = 2.0*beta*bufz[ia+a2*(ib+1)] - (ib != 0 ? ib*bufz[ia+a2*(ib-1)] : 0);
244 }
245 }
246 } else {
247 for (int ib = 0; ib <= b; ++ib) {
248 for (int ia = 0; ia <= a; ++ia) {
249 bufx_a[ia+a2*ib] = 2.0*alpha*bufx[ia+1+a2*(ib)] - (ia != 0 ? ia*bufx[ia-1+a2*(ib)] : 0);
250 bufy_a[ia+a2*ib] = 2.0*alpha*bufy[ia+1+a2*(ib)] - (ia != 0 ? ia*bufy[ia-1+a2*(ib)] : 0);
251 bufz_a[ia+a2*ib] = 2.0*alpha*bufz[ia+1+a2*(ib)] - (ia != 0 ? ia*bufz[ia-1+a2*(ib)] : 0);
252 }
253 }
254 }
255
256 const int offset_ii = ii * acsize;
257 double* current_data0 = data_ + offset_ii;
258 double* current_data1 = data_ + offset_ii + size_block_;
259 double* current_data2 = data_ + offset_ii + size_block_*2;
260
261 for (int iaz = 0; iaz <= a; ++iaz) {
262 for (int iay = 0; iay <= a - iaz; ++iay) {
263 const int iax = a - iaz - iay;
264 for (int ibz = 0; ibz <= b; ++ibz) {
265 for (int iby = 0; iby <= b - ibz; ++iby) {
266 const int ibx = b - ibz - iby;
267 *current_data0++ += bufx_a[iax+a2*ibx] * bufy [iay+a2*iby] * bufz [iaz+a2*ibz];
268 *current_data1++ += bufx [iax+a2*ibx] * bufy_a[iay+a2*iby] * bufz [iaz+a2*ibz];
269 *current_data2++ += bufx [iax+a2*ibx] * bufy [iay+a2*iby] * bufz_a[iaz+a2*ibz];
270 }
271 }
272 }
273 }
274 } // end of primsize loop
275
276 stack_->release(a2*b2, bufz_a);
277 stack_->release(a2*b2, bufy_a);
278 stack_->release(a2*b2, bufx_a);
279 stack_->release(a2*b2, bufz);
280 stack_->release(a2*b2, bufy);
281 stack_->release(a2*b2, bufx);
282
283 stack_->release(worksize, workz);
284 stack_->release(worksize, worky);
285 stack_->release(worksize, workx);
286
287 stack_->release((amax_+1)*a2*b2, transz);
288 stack_->release((amax_+1)*a2*b2, transy);
289 stack_->release((amax_+1)*a2*b2, transx);
290
291 const size_t acpsize = acsize*cont0_*cont1_;
292 double* const bkup = stack_->get(acpsize);
293 double* cdata = data_;
294 const SortList sort(spherical_);
295 for (int i = 0; i != 3; ++i, cdata += size_block_) {
296 const double* source = cdata;
297 double* target = bkup;
298 perform_contraction(acsize, source, prim0_, prim1_, target,
299 basisinfo_[0]->contractions(), basisinfo_[0]->contraction_ranges(), cont0_,
300 basisinfo_[1]->contractions(), basisinfo_[1]->contraction_ranges(), cont1_);
301
302 if (spherical_) {
303 const unsigned int carsph_index = basisinfo_[0]->angular_number() * ANG_HRR_END + basisinfo_[1]->angular_number();
304 const int nloops = cont0_ * cont1_;
305 source = bkup;
306 target = cdata;
307 carsphlist.carsphfunc_call(carsph_index, nloops, source, target);
308
309 const unsigned int sort_index = basisinfo_[1]->angular_number() * ANG_HRR_END + basisinfo_[0]->angular_number();
310 source = cdata;
311 target = bkup;
312 sort.sortfunc_call(sort_index, target, source, cont1_, cont0_, 1, swap01_);
313 copy_n(bkup, acpsize, cdata);
314 } else {
315 const unsigned int sort_index = basisinfo_[1]->angular_number() * ANG_HRR_END + basisinfo_[0]->angular_number();
316 source = bkup;
317 target = cdata;
318 sort.sortfunc_call(sort_index, target, source, cont1_, cont0_, 1, swap01_);
319 }
320 }
321
322 if (!swap01_)
323 for (int i = 0; i != 3; ++i) {
324 blas::ax_plus_y_n(1.0, data_+i*size_block_, cont0_*cont1_*acsize, data_+(i+3)*size_block_);
325 blas::ax_plus_y_n(-1.0, data_+i*size_block_, cont0_*cont1_*acsize, data_+i*size_block_);
326 }
327
328 stack_->release(acpsize, bkup);
329 }
330