1 //
2 // BAGEL - Brilliantly Advanced General Electronic Structure Library
3 // Filename: gkcompute.cc
4 // Copyright (C) 2012 Toru Shiozaki
5 //
6 // Author: Toru Shiozaki <shiozaki@northwestern.edu>
7 // Maintainer: Shiozaki group
8 //
9 // This file is part of the BAGEL package.
10 //
11 // This program is free software: you can redistribute it and/or modify
12 // it under the terms of the GNU General Public License as published by
13 // the Free Software Foundation, either version 3 of the License, or
14 // (at your option) any later version.
15 //
16 // This program is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 // GNU General Public License for more details.
20 //
21 // You should have received a copy of the GNU General Public License
22 // along with this program. If not, see <http://www.gnu.org/licenses/>.
23 //
24
25
26 #include <src/integral/carsphlist.h>
27 #include <src/integral/os/gkineticbatch.h>
28 #include <src/util/math/comb.h>
29
30 using namespace std;
31 using namespace bagel;
32
33 const static Comb comb;
34 const static CarSphList carsphlist;
35
compute()36 void GKineticBatch::compute() {
37
38 fill_n(data_, size_alloc_, 0.0);
39
40 const int a = basisinfo_[0]->angular_number();
41 const int b = basisinfo_[1]->angular_number();
42 const int a2 = a+1+nrank();
43 const int b2 = b+1+nrank();
44 assert(amax_ == a+b+nrank());
45 const int acsize = (a+1)*(a+2)*(b+1)*(b+2)/4;
46
47 double* const transx = stack_->get((amax_+1)*a2*b2);
48 double* const transy = stack_->get((amax_+1)*a2*b2);
49 double* const transz = stack_->get((amax_+1)*a2*b2);
50 fill_n(transx, (amax_+1)*a2*b2, 0.0);
51 fill_n(transy, (amax_+1)*a2*b2, 0.0);
52 fill_n(transz, (amax_+1)*a2*b2, 0.0);
53 for (int ib = 0, k = 0; ib <= b+nrank(); ++ib) {
54 for (int ia = 0; ia <= a+nrank(); ++ia, ++k) {
55 if (ia > a && ib > b) continue;
56 for (int i = ia; i <= ia+ib; ++i) {
57 transx[i + (amax_+1)*k] = comb(ib, ia+ib-i) * pow(AB_[0], ia+ib-i);
58 transy[i + (amax_+1)*k] = comb(ib, ia+ib-i) * pow(AB_[1], ia+ib-i);
59 transz[i + (amax_+1)*k] = comb(ib, ia+ib-i) * pow(AB_[2], ia+ib-i);
60 }
61 }
62 }
63 const int worksize = amax_+1;
64 double* const workx = stack_->get(worksize);
65 double* const worky = stack_->get(worksize);
66 double* const workz = stack_->get(worksize);
67
68 double* const bufx = stack_->get(a2*b2);
69 double* const bufy = stack_->get(a2*b2);
70 double* const bufz = stack_->get(a2*b2);
71 double* const bufx_a = stack_->get(a2*b2*nrank());
72 double* const bufy_a = stack_->get(a2*b2*nrank());
73 double* const bufz_a = stack_->get(a2*b2*nrank());
74
75 // Perform VRR
76 for (int ii = 0; ii != prim0_ * prim1_; ++ii) {
77
78 /// Sx(0 : i+j+1, 0) will be made here
79 workx[0] = coeffsx_[ii];
80 worky[0] = coeffsy_[ii];
81 workz[0] = coeffsz_[ii];
82 workx[1] = (P_[ii*3 ] - basisinfo_[0]->position(0)) * workx[0];
83 worky[1] = (P_[ii*3 + 1] - basisinfo_[0]->position(1)) * worky[0];
84 workz[1] = (P_[ii*3 + 2] - basisinfo_[0]->position(2)) * workz[0];
85 for (int i = 2; i <= amax_; ++i) {
86 workx[i] = (P_[ii*3 ] - basisinfo_[0]->position(0)) * workx[i - 1] + 0.5 * (i - 1) / xp_[ii] * workx[i - 2];
87 worky[i] = (P_[ii*3 + 1] - basisinfo_[0]->position(1)) * worky[i - 1] + 0.5 * (i - 1) / xp_[ii] * worky[i - 2];
88 workz[i] = (P_[ii*3 + 2] - basisinfo_[0]->position(2)) * workz[i - 1] + 0.5 * (i - 1) / xp_[ii] * workz[i - 2];
89 }
90 // HRR is done in one shot
91 dgemv_("T", amax_+1, a2*b2, 1.0, transx, amax_+1, workx, 1, 0.0, bufx, 1);
92 dgemv_("T", amax_+1, a2*b2, 1.0, transy, amax_+1, worky, 1, 0.0, bufy, 1);
93 dgemv_("T", amax_+1, a2*b2, 1.0, transz, amax_+1, workz, 1, 0.0, bufz, 1);
94
95
96 const double alpha = xa_[ii];
97 const double* tmpx = bufx;
98 const double* tmpy = bufy;
99 const double* tmpz = bufz;
100 for (int i = 0; i != nrank(); ++i) {
101 const int rem = nrank()-i-1;
102 for (int ib = 0; ib <= b+rem; ++ib) {
103 for (int ia = 0; ia <= a+rem; ++ia) {
104 bufx_a[ia+a2*ib+a2*b2*i] = 2.0*alpha*tmpx[ia+1+a2*(ib)] - (ia != 0 ? ia*tmpx[ia-1+a2*(ib)] : 0);
105 bufy_a[ia+a2*ib+a2*b2*i] = 2.0*alpha*tmpy[ia+1+a2*(ib)] - (ia != 0 ? ia*tmpy[ia-1+a2*(ib)] : 0);
106 bufz_a[ia+a2*ib+a2*b2*i] = 2.0*alpha*tmpz[ia+1+a2*(ib)] - (ia != 0 ? ia*tmpz[ia-1+a2*(ib)] : 0);
107 }
108 }
109 tmpx = bufx_a+a2*b2*i;
110 tmpy = bufy_a+a2*b2*i;
111 tmpz = bufz_a+a2*b2*i;
112 }
113
114 /// assembly process
115 const int offset_ii = ii * acsize;
116 double* current_data0 = data_ + offset_ii;
117 double* current_data1 = data_ + offset_ii + size_block_;
118 double* current_data2 = data_ + offset_ii + size_block_*2;
119
120 for (int iaz = 0; iaz <= a; ++iaz) {
121 for (int iay = 0; iay <= a - iaz; ++iay) {
122 const int iax = a - iaz - iay;
123 for (int ibz = 0; ibz <= b; ++ibz) {
124 for (int iby = 0; iby <= b - ibz; ++iby) {
125 const int ibx = b - ibz - iby;
126
127 *current_data0++ += bufx_a[iax+a2*ibx+a2*b2*2] * bufy [iay+a2*iby ] * bufz [iaz+a2*ibz ]
128 + bufx_a[iax+a2*ibx ] * bufy_a[iay+a2*iby+a2*b2*1] * bufz [iaz+a2*ibz ]
129 + bufx_a[iax+a2*ibx ] * bufy [iay+a2*iby ] * bufz_a[iaz+a2*ibz+a2*b2*1];
130 *current_data1++ += bufx_a[iax+a2*ibx+a2*b2*1] * bufy_a[iay+a2*iby ] * bufz [iaz+a2*ibz ]
131 + bufx [iax+a2*ibx ] * bufy_a[iay+a2*iby+a2*b2*2] * bufz [iaz+a2*ibz ]
132 + bufx [iax+a2*ibx ] * bufy_a[iay+a2*iby ] * bufz_a[iaz+a2*ibz+a2*b2*1];
133 *current_data2++ += bufx_a[iax+a2*ibx+a2*b2*1] * bufy [iay+a2*iby ] * bufz_a[iaz+a2*ibz ]
134 + bufx [iax+a2*ibx ] * bufy_a[iay+a2*iby+a2*b2*1] * bufz_a[iaz+a2*ibz ]
135 + bufx [iax+a2*ibx ] * bufy [iay+a2*iby ] * bufz_a[iaz+a2*ibz+a2*b2*2];
136 }
137 }
138 }
139 }
140
141 } // end of primsize loop
142
143 stack_->release(a2*b2*nrank(), bufz_a);
144 stack_->release(a2*b2*nrank(), bufy_a);
145 stack_->release(a2*b2*nrank(), bufx_a);
146
147 stack_->release(a2*b2, bufz);
148 stack_->release(a2*b2, bufy);
149 stack_->release(a2*b2, bufx);
150
151 stack_->release(worksize, workz);
152 stack_->release(worksize, worky);
153 stack_->release(worksize, workx);
154
155 stack_->release((amax_+1)*a2*b2, transz);
156 stack_->release((amax_+1)*a2*b2, transy);
157 stack_->release((amax_+1)*a2*b2, transx);
158
159 const size_t acpsize = acsize*cont0_*cont1_;
160 double* const bkup = stack_->get(acpsize);
161 double* cdata = data_;
162 const SortList sort(spherical_);
163 for (int i = 0; i != 3; ++i, cdata += size_block_) {
164 // first, contraction.
165 const double* source = cdata;
166 double* target = bkup;
167 perform_contraction(acsize, source, prim0_, prim1_, target,
168 basisinfo_[0]->contractions(), basisinfo_[0]->contraction_ranges(), cont0_,
169 basisinfo_[1]->contractions(), basisinfo_[1]->contraction_ranges(), cont1_);
170
171 if (spherical_) {
172 const unsigned int carsph_index = basisinfo_[0]->angular_number() * ANG_HRR_END + basisinfo_[1]->angular_number();
173 const int nloops = cont0_ * cont1_;
174 source = bkup;
175 target = cdata;
176 carsphlist.carsphfunc_call(carsph_index, nloops, source, target);
177
178 const unsigned int sort_index = basisinfo_[1]->angular_number() * ANG_HRR_END + basisinfo_[0]->angular_number();
179 source = cdata;
180 target = bkup;
181 sort.sortfunc_call(sort_index, target, source, cont1_, cont0_, 1, swap01_);
182 copy_n(bkup, acpsize, cdata);
183 } else {
184 const unsigned int sort_index = basisinfo_[1]->angular_number() * ANG_HRR_END + basisinfo_[0]->angular_number();
185 source = bkup;
186 target = cdata;
187 sort.sortfunc_call(sort_index, target, source, cont1_, cont0_, 1, swap01_);
188 }
189 // since this is a kinetic operator
190 blas::scale_n(-0.5, cdata, cont0_*cont1_*acsize);
191 }
192
193 for (int i = 0; i != 3; ++i)
194 blas::ax_plus_y_n(-1.0, data_+i*size_block_, cont0_*cont1_*acsize, data_+(i+3)*size_block_);
195
196 stack_->release(acpsize, bkup);
197
198 }
199
200