1 //
2 // BAGEL - Brilliantly Advanced General Electronic Structure Library
3 // Filename: gkcompute.cc
4 // Copyright (C) 2012 Toru Shiozaki
5 //
6 // Author: Toru Shiozaki <shiozaki@northwestern.edu>
7 // Maintainer: Shiozaki group
8 //
9 // This file is part of the BAGEL package.
10 //
11 // This program is free software: you can redistribute it and/or modify
12 // it under the terms of the GNU General Public License as published by
13 // the Free Software Foundation, either version 3 of the License, or
14 // (at your option) any later version.
15 //
16 // This program is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 // GNU General Public License for more details.
20 //
21 // You should have received a copy of the GNU General Public License
22 // along with this program.  If not, see <http://www.gnu.org/licenses/>.
23 //
24 
25 
26 #include <src/integral/carsphlist.h>
27 #include <src/integral/os/gkineticbatch.h>
28 #include <src/util/math/comb.h>
29 
30 using namespace std;
31 using namespace bagel;
32 
33 const static Comb comb;
34 const static CarSphList carsphlist;
35 
compute()36 void GKineticBatch::compute() {
37 
38   fill_n(data_, size_alloc_, 0.0);
39 
40   const int a = basisinfo_[0]->angular_number();
41   const int b = basisinfo_[1]->angular_number();
42   const int a2 = a+1+nrank();
43   const int b2 = b+1+nrank();
44   assert(amax_ == a+b+nrank());
45   const int acsize = (a+1)*(a+2)*(b+1)*(b+2)/4;
46 
47   double* const transx = stack_->get((amax_+1)*a2*b2);
48   double* const transy = stack_->get((amax_+1)*a2*b2);
49   double* const transz = stack_->get((amax_+1)*a2*b2);
50   fill_n(transx, (amax_+1)*a2*b2, 0.0);
51   fill_n(transy, (amax_+1)*a2*b2, 0.0);
52   fill_n(transz, (amax_+1)*a2*b2, 0.0);
53   for (int ib = 0, k = 0; ib <= b+nrank(); ++ib) {
54     for (int ia = 0; ia <= a+nrank(); ++ia, ++k) {
55       if (ia > a && ib > b) continue;
56       for (int i = ia; i <= ia+ib; ++i) {
57         transx[i + (amax_+1)*k] = comb(ib, ia+ib-i) * pow(AB_[0], ia+ib-i);
58         transy[i + (amax_+1)*k] = comb(ib, ia+ib-i) * pow(AB_[1], ia+ib-i);
59         transz[i + (amax_+1)*k] = comb(ib, ia+ib-i) * pow(AB_[2], ia+ib-i);
60       }
61     }
62   }
63   const int worksize = amax_+1;
64   double* const workx = stack_->get(worksize);
65   double* const worky = stack_->get(worksize);
66   double* const workz = stack_->get(worksize);
67 
68   double* const bufx = stack_->get(a2*b2);
69   double* const bufy = stack_->get(a2*b2);
70   double* const bufz = stack_->get(a2*b2);
71   double* const bufx_a = stack_->get(a2*b2*nrank());
72   double* const bufy_a = stack_->get(a2*b2*nrank());
73   double* const bufz_a = stack_->get(a2*b2*nrank());
74 
75   // Perform VRR
76   for (int ii = 0; ii != prim0_ * prim1_; ++ii) {
77 
78     /// Sx(0 : i+j+1, 0) will be made here
79     workx[0] = coeffsx_[ii];
80     worky[0] = coeffsy_[ii];
81     workz[0] = coeffsz_[ii];
82     workx[1] = (P_[ii*3    ] - basisinfo_[0]->position(0)) * workx[0];
83     worky[1] = (P_[ii*3 + 1] - basisinfo_[0]->position(1)) * worky[0];
84     workz[1] = (P_[ii*3 + 2] - basisinfo_[0]->position(2)) * workz[0];
85     for (int i = 2; i <= amax_; ++i) {
86       workx[i] = (P_[ii*3    ] - basisinfo_[0]->position(0)) * workx[i - 1] + 0.5 * (i - 1) / xp_[ii] * workx[i - 2];
87       worky[i] = (P_[ii*3 + 1] - basisinfo_[0]->position(1)) * worky[i - 1] + 0.5 * (i - 1) / xp_[ii] * worky[i - 2];
88       workz[i] = (P_[ii*3 + 2] - basisinfo_[0]->position(2)) * workz[i - 1] + 0.5 * (i - 1) / xp_[ii] * workz[i - 2];
89     }
90     // HRR is done in one shot
91     dgemv_("T", amax_+1, a2*b2, 1.0, transx, amax_+1, workx, 1, 0.0, bufx, 1);
92     dgemv_("T", amax_+1, a2*b2, 1.0, transy, amax_+1, worky, 1, 0.0, bufy, 1);
93     dgemv_("T", amax_+1, a2*b2, 1.0, transz, amax_+1, workz, 1, 0.0, bufz, 1);
94 
95 
96     const double alpha = xa_[ii];
97     const double* tmpx = bufx;
98     const double* tmpy = bufy;
99     const double* tmpz = bufz;
100     for (int i = 0; i != nrank(); ++i) {
101       const int rem = nrank()-i-1;
102       for (int ib = 0; ib <= b+rem; ++ib) {
103         for (int ia = 0; ia <= a+rem; ++ia) {
104           bufx_a[ia+a2*ib+a2*b2*i] = 2.0*alpha*tmpx[ia+1+a2*(ib)] - (ia != 0 ? ia*tmpx[ia-1+a2*(ib)] : 0);
105           bufy_a[ia+a2*ib+a2*b2*i] = 2.0*alpha*tmpy[ia+1+a2*(ib)] - (ia != 0 ? ia*tmpy[ia-1+a2*(ib)] : 0);
106           bufz_a[ia+a2*ib+a2*b2*i] = 2.0*alpha*tmpz[ia+1+a2*(ib)] - (ia != 0 ? ia*tmpz[ia-1+a2*(ib)] : 0);
107         }
108       }
109       tmpx = bufx_a+a2*b2*i;
110       tmpy = bufy_a+a2*b2*i;
111       tmpz = bufz_a+a2*b2*i;
112     }
113 
114     /// assembly process
115     const int offset_ii = ii * acsize;
116     double* current_data0 = data_ + offset_ii;
117     double* current_data1 = data_ + offset_ii + size_block_;
118     double* current_data2 = data_ + offset_ii + size_block_*2;
119 
120     for (int iaz = 0; iaz <= a; ++iaz) {
121       for (int iay = 0; iay <= a - iaz; ++iay) {
122         const int iax = a - iaz - iay;
123         for (int ibz = 0; ibz <= b; ++ibz) {
124           for (int iby = 0; iby <= b - ibz; ++iby) {
125             const int ibx = b - ibz - iby;
126 
127             *current_data0++ += bufx_a[iax+a2*ibx+a2*b2*2] * bufy  [iay+a2*iby        ] * bufz  [iaz+a2*ibz        ]
128                               + bufx_a[iax+a2*ibx        ] * bufy_a[iay+a2*iby+a2*b2*1] * bufz  [iaz+a2*ibz        ]
129                               + bufx_a[iax+a2*ibx        ] * bufy  [iay+a2*iby        ] * bufz_a[iaz+a2*ibz+a2*b2*1];
130             *current_data1++ += bufx_a[iax+a2*ibx+a2*b2*1] * bufy_a[iay+a2*iby        ] * bufz  [iaz+a2*ibz        ]
131                               + bufx  [iax+a2*ibx        ] * bufy_a[iay+a2*iby+a2*b2*2] * bufz  [iaz+a2*ibz        ]
132                               + bufx  [iax+a2*ibx        ] * bufy_a[iay+a2*iby        ] * bufz_a[iaz+a2*ibz+a2*b2*1];
133             *current_data2++ += bufx_a[iax+a2*ibx+a2*b2*1] * bufy  [iay+a2*iby        ] * bufz_a[iaz+a2*ibz        ]
134                               + bufx  [iax+a2*ibx        ] * bufy_a[iay+a2*iby+a2*b2*1] * bufz_a[iaz+a2*ibz        ]
135                               + bufx  [iax+a2*ibx        ] * bufy  [iay+a2*iby        ] * bufz_a[iaz+a2*ibz+a2*b2*2];
136           }
137         }
138       }
139     }
140 
141   } // end of primsize loop
142 
143   stack_->release(a2*b2*nrank(), bufz_a);
144   stack_->release(a2*b2*nrank(), bufy_a);
145   stack_->release(a2*b2*nrank(), bufx_a);
146 
147   stack_->release(a2*b2, bufz);
148   stack_->release(a2*b2, bufy);
149   stack_->release(a2*b2, bufx);
150 
151   stack_->release(worksize, workz);
152   stack_->release(worksize, worky);
153   stack_->release(worksize, workx);
154 
155   stack_->release((amax_+1)*a2*b2, transz);
156   stack_->release((amax_+1)*a2*b2, transy);
157   stack_->release((amax_+1)*a2*b2, transx);
158 
159   const size_t acpsize = acsize*cont0_*cont1_;
160   double* const bkup = stack_->get(acpsize);
161   double* cdata = data_;
162   const SortList sort(spherical_);
163   for (int i = 0; i != 3; ++i, cdata += size_block_) {
164     // first, contraction.
165     const double* source = cdata;
166     double* target = bkup;
167     perform_contraction(acsize, source, prim0_, prim1_, target,
168                         basisinfo_[0]->contractions(), basisinfo_[0]->contraction_ranges(), cont0_,
169                         basisinfo_[1]->contractions(), basisinfo_[1]->contraction_ranges(), cont1_);
170 
171     if (spherical_) {
172       const unsigned int carsph_index = basisinfo_[0]->angular_number() * ANG_HRR_END + basisinfo_[1]->angular_number();
173       const int nloops = cont0_ * cont1_;
174       source = bkup;
175       target = cdata;
176       carsphlist.carsphfunc_call(carsph_index, nloops, source, target);
177 
178       const unsigned int sort_index = basisinfo_[1]->angular_number() * ANG_HRR_END + basisinfo_[0]->angular_number();
179       source = cdata;
180       target = bkup;
181       sort.sortfunc_call(sort_index, target, source, cont1_, cont0_, 1, swap01_);
182       copy_n(bkup, acpsize, cdata);
183     } else {
184       const unsigned int sort_index = basisinfo_[1]->angular_number() * ANG_HRR_END + basisinfo_[0]->angular_number();
185       source = bkup;
186       target = cdata;
187       sort.sortfunc_call(sort_index, target, source, cont1_, cont0_, 1, swap01_);
188     }
189     // since this is a kinetic operator
190     blas::scale_n(-0.5, cdata, cont0_*cont1_*acsize);
191   }
192 
193   for (int i = 0; i != 3; ++i)
194     blas::ax_plus_y_n(-1.0, data_+i*size_block_, cont0_*cont1_*acsize, data_+(i+3)*size_block_);
195 
196   stack_->release(acpsize, bkup);
197 
198 }
199 
200