1 /*  Copyright 2014-2020 The PySCF Developers. All Rights Reserved.
2 
3     Licensed under the Apache License, Version 2.0 (the "License");
4     you may not use this file except in compliance with the License.
5     You may obtain a copy of the License at
6 
7         http://www.apache.org/licenses/LICENSE-2.0
8 
9     Unless required by applicable law or agreed to in writing, software
10     distributed under the License is distributed on an "AS IS" BASIS,
11     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12     See the License for the specific language governing permissions and
13     limitations under the License.
14 
15  *
16  *  Author: Oliver J. Backhouse <olbackhouse@gmail.com>
17  *          Alejandro Santana-Bonilla <alejandro.santana_bonilla@kcl.ac.uk>
18  *          George H. Booth <george.booth@kcl.ac.uk>
19  */
20 
21 #include<stdlib.h>
22 #include<assert.h>
23 #include<math.h>
24 
25 //#include "omp.h"
26 #include "config.h"
27 #include "vhf/fblas.h"
28 
29 
30 
31 /*
32  *  b_x = alpha * a_x + beta * b_x
33  */
AGF2sum_inplace(double * a,double * b,int x,double alpha,double beta)34 void AGF2sum_inplace(double *a,
35                      double *b,
36                      int x,
37                      double alpha,
38                      double beta)
39 {
40     int i;
41 
42     for (i = 0; i < x; i++) {
43         b[i] *= beta;
44         b[i] += alpha * a[i];
45     }
46 }
47 
48 
49 /*
50  *  b_x = a_x * b_x
51  */
AGF2prod_inplace(double * a,double * b,int x)52 void AGF2prod_inplace(double *a,
53                       double *b,
54                       int x)
55 {
56     int i;
57 
58     for (i = 0; i < x; i++) {
59         b[i] *= a[i];
60     }
61 }
62 
63 
64 /*
65  *  c_x = a_x * b_x
66  */
AGF2prod_outplace(double * a,double * b,int x,double * c)67 void AGF2prod_outplace(double *a,
68                        double *b,
69                        int x,
70                        double *c)
71 {
72     int i;
73 
74     for (i = 0; i < x; i++) {
75         c[i] = a[i] * b[i];
76     }
77 }
78 
79 
80 /*
81  *  b_xz = a_xiz
82  */
AGF2slice_0i2(double * a,int x,int y,int z,int idx,double * b)83 void AGF2slice_0i2(double *a,
84                    int x,
85                    int y,
86                    int z,
87                    int idx,
88                    double *b)
89 {
90     double *pa, *pb;
91     int i, k;
92 
93     for (i = 0; i < x; i++) {
94         pb = b + i*z;
95         pa = a + i*y*z + idx*z;
96         for (k = 0; k < z; k++) {
97             pb[k] = pa[k];
98         }
99     }
100 }
101 
102 
103 /*
104  *  b_xy = a_xyi
105  */
AGF2slice_01i(double * a,int x,int y,int z,int idx,double * b)106 void AGF2slice_01i(double *a,
107                    int x,
108                    int y,
109                    int z,
110                    int idx,
111                    double *b)
112 {
113     double *pa, *pb;
114     int i, j;
115 
116     for (i = 0; i < x; i++) {
117         pb = b + i*y;
118         pa = a + i*y*z + idx;
119         for (j = 0; j < y; j++) {
120             pb[j] = pa[j*z];
121         }
122     }
123 }
124 
125 
126 /*
127  *  d_xy = a + b_x - c_y
128  */
AGF2sum_inplace_ener(double a,double * b,double * c,int x,int y,double * d)129 void AGF2sum_inplace_ener(double a,
130                   double *b,
131                   double *c,
132                   int x,
133                   int y,
134                   double *d)
135 {
136     double *pd;
137     int i, j;
138 
139     for (i = 0; i < x; i++) {
140         pd = d + i*y;
141         for (j = 0; j < y; j++) {
142             pd[j] = a + b[i] - c[j];
143         }
144     }
145 }
146 
147 
148 /*
149  *  b_xy = a_y * b_xy
150  */
AGF2prod_inplace_ener(double * a,double * b,int x,int y)151 void AGF2prod_inplace_ener(double *a,
152                            double *b,
153                            int x,
154                            int y)
155 {
156     double *pb;
157     int i;
158 
159     for (i = 0; i < x; i++) {
160         pb = b + i*y;
161         AGF2prod_inplace(a, pb, y);
162     }
163 }
164 
165 
166 /*
167  *   c_xy = a_y * b_xy
168  */
AGF2prod_outplace_ener(double * a,double * b,int x,int y,double * c)169 void AGF2prod_outplace_ener(double *a,
170                             double *b,
171                             int x,
172                             int y,
173                             double *c)
174 {
175     double *pb, *pc;
176     int i;
177 
178     for (i = 0; i < x; i++) {
179         pb = b + i*y;
180         pc = c + i*y;
181         AGF2prod_outplace(a, pb, y, pc);
182     }
183 }
184 
185 
186 /*
187  *  exact ERI
188  *  vv_xy = (xi|ja) [2(yi|ja) - (yj|ia)]
189  *  vev_xy = (xi|ja) [2(yi|ja) - (yj|ia)] (ei + ej - ea)
190  */
AGF2ee_vv_vev_islice(double * xija,double * e_i,double * e_a,double os_factor,double ss_factor,int nmo,int nocc,int nvir,int istart,int iend,double * vv,double * vev)191 void AGF2ee_vv_vev_islice(double *xija,
192                           double *e_i,
193                           double *e_a,
194                           double os_factor,
195                           double ss_factor,
196                           int nmo,
197                           int nocc,
198                           int nvir,
199                           int istart,
200                           int iend,
201                           double *vv,
202                           double *vev)
203 {
204     const double D1 = 1;
205     const char TRANS_T = 'T';
206     const char TRANS_N = 'N';
207 
208     const int nja = nocc * nvir;
209     const int nxi = nmo * nocc;
210     const double fpos = os_factor + ss_factor;
211     const double fneg = -1.0 * ss_factor;
212 
213 #pragma omp parallel
214 {
215     double *eja = calloc(nocc*nvir, sizeof(double));
216     double *xia = calloc(nmo*nocc*nvir, sizeof(double));
217     double *xja = calloc(nmo*nocc*nvir, sizeof(double));
218 
219     double *vv_priv = calloc(nmo*nmo, sizeof(double));
220     double *vev_priv = calloc(nmo*nmo, sizeof(double));
221 
222     int i;
223 
224 #pragma omp for
225     for (i = istart; i < iend; i++) {
226         // build xija
227         AGF2slice_0i2(xija, nmo, nocc, nja, i, xja);
228 
229         // build xjia
230         AGF2slice_0i2(xija, nxi, nocc, nvir, i, xia);
231 
232         // build eija = ei + ej - ea
233         AGF2sum_inplace_ener(e_i[i], e_i, e_a, nocc, nvir, eja);
234 
235         // inplace xjia = 2 * xija - xjia
236         AGF2sum_inplace(xja, xia, nmo*nja, fpos, fneg);
237 
238         // vv_xy += xija * (2 yija - yjia)
239         dgemm_(&TRANS_T, &TRANS_N, &nmo, &nmo, &nja, &D1, xia, &nja, xja, &nja, &D1, vv_priv, &nmo);
240 
241         // inplace xija = eija * xija
242         AGF2prod_inplace_ener(eja, xja, nmo, nja);
243 
244         // vev_xy += xija * eija * (2 yija - yjia)
245         dgemm_(&TRANS_T, &TRANS_N, &nmo, &nmo, &nja, &D1, xia, &nja, xja, &nja, &D1, vev_priv, &nmo);
246     }
247 
248     free(eja);
249     free(xia);
250     free(xja);
251 
252 #pragma omp critical
253     for (i = 0; i < (nmo*nmo); i++) {
254         vv[i] += vv_priv[i];
255         vev[i] += vev_priv[i];
256     }
257 
258     free(vv_priv);
259     free(vev_priv);
260 }
261 }
262 
263 
264 /*
265  *  density fitting
266  *  (xi|ja) = (xi|Q)(Q|ja)
267  *  vv_xy = (xi|ja) [2(yi|ja) - (yj|ia)]
268  *  vev_xy = (xi|ja) [2(yi|ja) - (yj|ia)] (ei + ej - ea)
269  */
AGF2df_vv_vev_islice(double * qxi,double * qja,double * e_i,double * e_a,double os_factor,double ss_factor,int nmo,int nocc,int nvir,int naux,int istart,int iend,double * vv,double * vev)270 void AGF2df_vv_vev_islice(double *qxi,
271                           double *qja,
272                           double *e_i,
273                           double *e_a,
274                           double os_factor,
275                           double ss_factor,
276                           int nmo,
277                           int nocc,
278                           int nvir,
279                           int naux,
280                           int istart,
281                           int iend,
282                           double *vv,
283                           double *vev)
284 {
285     const double D0 = 0.0;
286     const double D1 = 1.0;
287     const char TRANS_T = 'T';
288     const char TRANS_N = 'N';
289 
290     const int nja = nocc * nvir;
291     const int nxi = nmo * nocc;
292     const double fpos = os_factor + ss_factor;
293     const double fneg = -1.0 * ss_factor;
294 
295 #pragma omp parallel
296 {
297     double *qa = calloc(naux*nvir, sizeof(double));
298     double *qx = calloc(naux*nmo, sizeof(double));
299     double *eja = calloc(nocc*nvir, sizeof(double));
300     double *xia = calloc(nmo*nocc*nvir, sizeof(double));
301     double *xja = calloc(nmo*nocc*nvir, sizeof(double));
302 
303     double *vv_priv = calloc(nmo*nmo, sizeof(double));
304     double *vev_priv = calloc(nmo*nmo, sizeof(double));
305 
306     int i;
307 
308 #pragma omp for
309     for (i = istart; i < iend; i++) {
310         // build qx
311         AGF2slice_01i(qxi, naux, nmo, nocc, i, qx);
312 
313         // build qa
314         AGF2slice_0i2(qja, naux, nocc, nvir, i, qa);
315 
316         // build xija = xq * qja
317         dgemm_(&TRANS_N, &TRANS_T, &nja, &nmo, &naux, &D1, qja, &nja, qx, &nmo, &D0, xja, &nja);
318 
319         // build xjia = xiq * qa
320         dgemm_(&TRANS_N, &TRANS_T, &nvir, &nxi, &naux, &D1, qa, &nvir, qxi, &nxi, &D0, xia, &nvir);
321         //printf("%13.9f %13.9f\n", xja[10], xia[10]); fflush(stdout);
322 
323         // build eija = ei + ej - ea
324         AGF2sum_inplace_ener(e_i[i], e_i, e_a, nocc, nvir, eja);
325 
326         // inplace xjia = 2 * xija - xjia
327         AGF2sum_inplace(xja, xia, nmo*nja, fpos, fneg);
328 
329         // vv_xy += xija * (2 yija - yjia)
330         dgemm_(&TRANS_T, &TRANS_N, &nmo, &nmo, &nja, &D1, xia, &nja, xja, &nja, &D1, vv_priv, &nmo);
331 
332         // inplace xija = eija * xija
333         AGF2prod_inplace_ener(eja, xja, nmo, nja);
334 
335         // vev_xy += xija * eija * (2 yija - yjia)
336         dgemm_(&TRANS_T, &TRANS_N, &nmo, &nmo, &nja, &D1, xia, &nja, xja, &nja, &D1, vev_priv, &nmo);
337     }
338 
339     free(qa);
340     free(qx);
341     free(eja);
342     free(xia);
343     free(xja);
344 
345 #pragma omp critical
346     for (i = 0; i < (nmo*nmo); i++) {
347         vv[i] += vv_priv[i];
348         vev[i] += vev_priv[i];
349     }
350 
351     free(vv_priv);
352     free(vev_priv);
353 }
354 }
355 
356 
357 /*
358  *  Removes an index from DGEMM and into a for loop to reduce the
359  *  thread-private memory overhead, at the cost of serial speed
360  */
AGF2df_vv_vev_islice_lowmem(double * qxi,double * qja,double * e_i,double * e_a,double os_factor,double ss_factor,int nmo,int nocc,int nvir,int naux,int start,int end,double * vv,double * vev)361 void AGF2df_vv_vev_islice_lowmem(double *qxi,
362                                  double *qja,
363                                  double *e_i,
364                                  double *e_a,
365                                  double os_factor,
366                                  double ss_factor,
367                                  int nmo,
368                                  int nocc,
369                                  int nvir,
370                                  int naux,
371                                  int start,
372                                  int end,
373                                  double *vv,
374                                  double *vev)
375 {
376     const double D0 = 0.0;
377     const double D1 = 1.0;
378     const char TRANS_T = 'T';
379     const char TRANS_N = 'N';
380     const int one = 1;
381 
382     const double fpos = os_factor + ss_factor;
383     const double fneg = -1.0 * ss_factor;
384 
385 //#pragma omp parallel
386 //{
387 //    double *xj = calloc(nmo*nocc, sizeof(double));
388 //    double *xi = calloc(nmo*nocc, sizeof(double));
389 //    double *qx = calloc(naux*nmo, sizeof(double));
390 //    double *qj = calloc(naux*nocc, sizeof(double));
391 //    double *ej = calloc(nocc, sizeof(double));
392 //
393 //    double *vv_priv = calloc(nmo*nmo, sizeof(double));
394 //    double *vev_priv = calloc(nmo*nmo, sizeof(double));
395 //
396 //    int i, a, ia;
397 //
398 //#pragma omp for
399 //    for (ia = start; ia < end; ia++) {
400 //        i = ia / nvir;
401 //        a = ia % nvir;
402 //
403 //        // build qx
404 //        AGF2slice_01i(qxi, naux, nmo, nocc, i, qx);
405 //
406 //        // build qj
407 //        AGF2slice_01i(qja, naux, nocc, nvir, a, qj);
408 //
409 //        // build xj = xq * qj
410 //        dgemm_(&TRANS_N, &TRANS_T, &nocc, &nmo, &naux, &D1, qj, &nocc, qx, &nmo, &D0, xj, &nocc);
411 //
412 //        // build xi = xiq * q    is this slow without incx=1?
413 //        dgemv_(&TRANS_N, &nxi, &naux, &D1, qxi, &nxi, &(qja[i*nvir+a]), &nja, &D0, xi, &one);
414 //
415 //        // build eija = ei + ej - ea
416 //        AGF2sum_inplace_ener(e_i[i], e_i, &(e_a[a]), nocc, one, ej);
417 //
418 //        // inplace xi = 2 * xj - xi
419 //        AGF2sum_inplace(xi, xj, nxi, fpos, fneg);
420 //
421 //        // vv_xy += xj * (2 * yj - yi)
422 //        dgemm_(&TRANS_T, &TRANS_N, &nmo, &nmo, &nocc, &D1, xi, &nocc, xj, &nocc, &D1, vv_priv, &nmo);
423 //
424 //        // inplace xj = ej * xj
425 //        AGF2prod_inplace_ener(ej, xj, nmo, nocc);
426 //
427 //        // vev_xy += xj * ej * (2 * yj - yi)
428 //        dgemm_(&TRANS_T, &TRANS_N, &nmo, &nmo, &nocc, &D1, xi, &nocc, xj, &nocc, &D1, vev_priv, &nmo);
429 //    }
430 //
431 //    free(xj);
432 //    free(xi);
433 //    free(qx);
434 //    free(qj);
435 //    free(ej);
436 
437 #pragma omp parallel
438 {
439     double *qx_i = calloc(naux*nmo, sizeof(double));
440     double *qx_j = calloc(naux*nmo, sizeof(double));
441     double *qa_i = calloc(naux*nvir, sizeof(double));
442     double *qa_j = calloc(naux*nvir, sizeof(double));
443     double *xa_i = calloc(nmo*nvir, sizeof(double));
444     double *xa_j = calloc(nmo*nvir, sizeof(double));
445     double *ea = calloc(nvir, sizeof(double));
446 
447     double *vv_priv = calloc(nmo*nmo, sizeof(double));
448     double *vev_priv = calloc(nmo*nmo, sizeof(double));
449 
450     int i, j, ij;
451 
452 #pragma omp for
453     for (ij = start; ij < end; ij++) {
454         i = ij / nocc;
455         j = ij % nocc;
456 
457         // build qx_i
458         AGF2slice_01i(qxi, naux, nmo, nocc, i, qx_i);
459 
460         // build qx_j
461         AGF2slice_01i(qxi, naux, nmo, nocc, j, qx_j);
462 
463         // build qa_i
464         AGF2slice_0i2(qja, naux, nocc, nvir, i, qa_i);
465 
466         // build qa_j
467         AGF2slice_0i2(qja, naux, nocc, nvir, j, qa_j);
468 
469         // build xija
470         dgemm_(&TRANS_N, &TRANS_T, &nvir, &nmo, &naux, &D1, qa_i, &nvir, qx_j, &nmo, &D0, xa_i, &nvir);
471 
472         // build xjia
473         dgemm_(&TRANS_N, &TRANS_T, &nvir, &nmo, &naux, &D1, qa_j, &nvir, qx_i, &nmo, &D0, xa_j, &nvir);
474 
475         // build eija = ei + ej - ea
476         AGF2sum_inplace_ener(e_i[i], &(e_i[j]), e_a, one, nvir, ea);
477 
478         // inplace xjia = 2 * xija - xjia
479         AGF2sum_inplace(xa_j, xa_i, nmo*nvir, fpos, fneg);
480 
481         // vv_xy += xija * (2 * yija - yjia)
482         dgemm_(&TRANS_T, &TRANS_N, &nmo, &nmo, &nvir, &D1, xa_j, &nvir, xa_i, &nvir, &D1, vv_priv, &nmo);
483 
484         // inplace xija = eija * xija
485         AGF2prod_inplace_ener(ea, xa_i, nmo, nvir);
486 
487         // vv_xy += xija * (2 * yija - yjia)
488         dgemm_(&TRANS_T, &TRANS_N, &nmo, &nmo, &nvir, &D1, xa_j, &nvir, xa_i, &nvir, &D1, vev_priv, &nmo);
489     }
490 
491     free(qx_i);
492     free(qx_j);
493     free(qa_i);
494     free(qa_j);
495     free(xa_i);
496     free(xa_j);
497     free(ea);
498 
499 #pragma omp critical
500     for (i = 0; i < (nmo*nmo); i++) {
501         vv[i] += vv_priv[i];
502         vev[i] += vev_priv[i];
503     }
504 
505     free(vv_priv);
506     free(vev_priv);
507 }
508 }
509