1 /* Copyright 2014-2020 The PySCF Developers. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14
15 *
16 * Author: Oliver J. Backhouse <olbackhouse@gmail.com>
17 * Alejandro Santana-Bonilla <alejandro.santana_bonilla@kcl.ac.uk>
18 * George H. Booth <george.booth@kcl.ac.uk>
19 */
20
21 #include<stdlib.h>
22 #include<assert.h>
23 #include<math.h>
24
25 //#include "omp.h"
26 #include "config.h"
27 #include "vhf/fblas.h"
28
29
30
31 /*
32 * b_x = alpha * a_x + beta * b_x
33 */
AGF2sum_inplace(double * a,double * b,int x,double alpha,double beta)34 void AGF2sum_inplace(double *a,
35 double *b,
36 int x,
37 double alpha,
38 double beta)
39 {
40 int i;
41
42 for (i = 0; i < x; i++) {
43 b[i] *= beta;
44 b[i] += alpha * a[i];
45 }
46 }
47
48
49 /*
50 * b_x = a_x * b_x
51 */
AGF2prod_inplace(double * a,double * b,int x)52 void AGF2prod_inplace(double *a,
53 double *b,
54 int x)
55 {
56 int i;
57
58 for (i = 0; i < x; i++) {
59 b[i] *= a[i];
60 }
61 }
62
63
64 /*
65 * c_x = a_x * b_x
66 */
AGF2prod_outplace(double * a,double * b,int x,double * c)67 void AGF2prod_outplace(double *a,
68 double *b,
69 int x,
70 double *c)
71 {
72 int i;
73
74 for (i = 0; i < x; i++) {
75 c[i] = a[i] * b[i];
76 }
77 }
78
79
80 /*
81 * b_xz = a_xiz
82 */
AGF2slice_0i2(double * a,int x,int y,int z,int idx,double * b)83 void AGF2slice_0i2(double *a,
84 int x,
85 int y,
86 int z,
87 int idx,
88 double *b)
89 {
90 double *pa, *pb;
91 int i, k;
92
93 for (i = 0; i < x; i++) {
94 pb = b + i*z;
95 pa = a + i*y*z + idx*z;
96 for (k = 0; k < z; k++) {
97 pb[k] = pa[k];
98 }
99 }
100 }
101
102
103 /*
104 * b_xy = a_xyi
105 */
AGF2slice_01i(double * a,int x,int y,int z,int idx,double * b)106 void AGF2slice_01i(double *a,
107 int x,
108 int y,
109 int z,
110 int idx,
111 double *b)
112 {
113 double *pa, *pb;
114 int i, j;
115
116 for (i = 0; i < x; i++) {
117 pb = b + i*y;
118 pa = a + i*y*z + idx;
119 for (j = 0; j < y; j++) {
120 pb[j] = pa[j*z];
121 }
122 }
123 }
124
125
126 /*
127 * d_xy = a + b_x - c_y
128 */
AGF2sum_inplace_ener(double a,double * b,double * c,int x,int y,double * d)129 void AGF2sum_inplace_ener(double a,
130 double *b,
131 double *c,
132 int x,
133 int y,
134 double *d)
135 {
136 double *pd;
137 int i, j;
138
139 for (i = 0; i < x; i++) {
140 pd = d + i*y;
141 for (j = 0; j < y; j++) {
142 pd[j] = a + b[i] - c[j];
143 }
144 }
145 }
146
147
148 /*
149 * b_xy = a_y * b_xy
150 */
AGF2prod_inplace_ener(double * a,double * b,int x,int y)151 void AGF2prod_inplace_ener(double *a,
152 double *b,
153 int x,
154 int y)
155 {
156 double *pb;
157 int i;
158
159 for (i = 0; i < x; i++) {
160 pb = b + i*y;
161 AGF2prod_inplace(a, pb, y);
162 }
163 }
164
165
166 /*
167 * c_xy = a_y * b_xy
168 */
AGF2prod_outplace_ener(double * a,double * b,int x,int y,double * c)169 void AGF2prod_outplace_ener(double *a,
170 double *b,
171 int x,
172 int y,
173 double *c)
174 {
175 double *pb, *pc;
176 int i;
177
178 for (i = 0; i < x; i++) {
179 pb = b + i*y;
180 pc = c + i*y;
181 AGF2prod_outplace(a, pb, y, pc);
182 }
183 }
184
185
186 /*
187 * exact ERI
188 * vv_xy = (xi|ja) [2(yi|ja) - (yj|ia)]
189 * vev_xy = (xi|ja) [2(yi|ja) - (yj|ia)] (ei + ej - ea)
190 */
AGF2ee_vv_vev_islice(double * xija,double * e_i,double * e_a,double os_factor,double ss_factor,int nmo,int nocc,int nvir,int istart,int iend,double * vv,double * vev)191 void AGF2ee_vv_vev_islice(double *xija,
192 double *e_i,
193 double *e_a,
194 double os_factor,
195 double ss_factor,
196 int nmo,
197 int nocc,
198 int nvir,
199 int istart,
200 int iend,
201 double *vv,
202 double *vev)
203 {
204 const double D1 = 1;
205 const char TRANS_T = 'T';
206 const char TRANS_N = 'N';
207
208 const int nja = nocc * nvir;
209 const int nxi = nmo * nocc;
210 const double fpos = os_factor + ss_factor;
211 const double fneg = -1.0 * ss_factor;
212
213 #pragma omp parallel
214 {
215 double *eja = calloc(nocc*nvir, sizeof(double));
216 double *xia = calloc(nmo*nocc*nvir, sizeof(double));
217 double *xja = calloc(nmo*nocc*nvir, sizeof(double));
218
219 double *vv_priv = calloc(nmo*nmo, sizeof(double));
220 double *vev_priv = calloc(nmo*nmo, sizeof(double));
221
222 int i;
223
224 #pragma omp for
225 for (i = istart; i < iend; i++) {
226 // build xija
227 AGF2slice_0i2(xija, nmo, nocc, nja, i, xja);
228
229 // build xjia
230 AGF2slice_0i2(xija, nxi, nocc, nvir, i, xia);
231
232 // build eija = ei + ej - ea
233 AGF2sum_inplace_ener(e_i[i], e_i, e_a, nocc, nvir, eja);
234
235 // inplace xjia = 2 * xija - xjia
236 AGF2sum_inplace(xja, xia, nmo*nja, fpos, fneg);
237
238 // vv_xy += xija * (2 yija - yjia)
239 dgemm_(&TRANS_T, &TRANS_N, &nmo, &nmo, &nja, &D1, xia, &nja, xja, &nja, &D1, vv_priv, &nmo);
240
241 // inplace xija = eija * xija
242 AGF2prod_inplace_ener(eja, xja, nmo, nja);
243
244 // vev_xy += xija * eija * (2 yija - yjia)
245 dgemm_(&TRANS_T, &TRANS_N, &nmo, &nmo, &nja, &D1, xia, &nja, xja, &nja, &D1, vev_priv, &nmo);
246 }
247
248 free(eja);
249 free(xia);
250 free(xja);
251
252 #pragma omp critical
253 for (i = 0; i < (nmo*nmo); i++) {
254 vv[i] += vv_priv[i];
255 vev[i] += vev_priv[i];
256 }
257
258 free(vv_priv);
259 free(vev_priv);
260 }
261 }
262
263
264 /*
265 * density fitting
266 * (xi|ja) = (xi|Q)(Q|ja)
267 * vv_xy = (xi|ja) [2(yi|ja) - (yj|ia)]
268 * vev_xy = (xi|ja) [2(yi|ja) - (yj|ia)] (ei + ej - ea)
269 */
AGF2df_vv_vev_islice(double * qxi,double * qja,double * e_i,double * e_a,double os_factor,double ss_factor,int nmo,int nocc,int nvir,int naux,int istart,int iend,double * vv,double * vev)270 void AGF2df_vv_vev_islice(double *qxi,
271 double *qja,
272 double *e_i,
273 double *e_a,
274 double os_factor,
275 double ss_factor,
276 int nmo,
277 int nocc,
278 int nvir,
279 int naux,
280 int istart,
281 int iend,
282 double *vv,
283 double *vev)
284 {
285 const double D0 = 0.0;
286 const double D1 = 1.0;
287 const char TRANS_T = 'T';
288 const char TRANS_N = 'N';
289
290 const int nja = nocc * nvir;
291 const int nxi = nmo * nocc;
292 const double fpos = os_factor + ss_factor;
293 const double fneg = -1.0 * ss_factor;
294
295 #pragma omp parallel
296 {
297 double *qa = calloc(naux*nvir, sizeof(double));
298 double *qx = calloc(naux*nmo, sizeof(double));
299 double *eja = calloc(nocc*nvir, sizeof(double));
300 double *xia = calloc(nmo*nocc*nvir, sizeof(double));
301 double *xja = calloc(nmo*nocc*nvir, sizeof(double));
302
303 double *vv_priv = calloc(nmo*nmo, sizeof(double));
304 double *vev_priv = calloc(nmo*nmo, sizeof(double));
305
306 int i;
307
308 #pragma omp for
309 for (i = istart; i < iend; i++) {
310 // build qx
311 AGF2slice_01i(qxi, naux, nmo, nocc, i, qx);
312
313 // build qa
314 AGF2slice_0i2(qja, naux, nocc, nvir, i, qa);
315
316 // build xija = xq * qja
317 dgemm_(&TRANS_N, &TRANS_T, &nja, &nmo, &naux, &D1, qja, &nja, qx, &nmo, &D0, xja, &nja);
318
319 // build xjia = xiq * qa
320 dgemm_(&TRANS_N, &TRANS_T, &nvir, &nxi, &naux, &D1, qa, &nvir, qxi, &nxi, &D0, xia, &nvir);
321 //printf("%13.9f %13.9f\n", xja[10], xia[10]); fflush(stdout);
322
323 // build eija = ei + ej - ea
324 AGF2sum_inplace_ener(e_i[i], e_i, e_a, nocc, nvir, eja);
325
326 // inplace xjia = 2 * xija - xjia
327 AGF2sum_inplace(xja, xia, nmo*nja, fpos, fneg);
328
329 // vv_xy += xija * (2 yija - yjia)
330 dgemm_(&TRANS_T, &TRANS_N, &nmo, &nmo, &nja, &D1, xia, &nja, xja, &nja, &D1, vv_priv, &nmo);
331
332 // inplace xija = eija * xija
333 AGF2prod_inplace_ener(eja, xja, nmo, nja);
334
335 // vev_xy += xija * eija * (2 yija - yjia)
336 dgemm_(&TRANS_T, &TRANS_N, &nmo, &nmo, &nja, &D1, xia, &nja, xja, &nja, &D1, vev_priv, &nmo);
337 }
338
339 free(qa);
340 free(qx);
341 free(eja);
342 free(xia);
343 free(xja);
344
345 #pragma omp critical
346 for (i = 0; i < (nmo*nmo); i++) {
347 vv[i] += vv_priv[i];
348 vev[i] += vev_priv[i];
349 }
350
351 free(vv_priv);
352 free(vev_priv);
353 }
354 }
355
356
357 /*
358 * Removes an index from DGEMM and into a for loop to reduce the
359 * thread-private memory overhead, at the cost of serial speed
360 */
AGF2df_vv_vev_islice_lowmem(double * qxi,double * qja,double * e_i,double * e_a,double os_factor,double ss_factor,int nmo,int nocc,int nvir,int naux,int start,int end,double * vv,double * vev)361 void AGF2df_vv_vev_islice_lowmem(double *qxi,
362 double *qja,
363 double *e_i,
364 double *e_a,
365 double os_factor,
366 double ss_factor,
367 int nmo,
368 int nocc,
369 int nvir,
370 int naux,
371 int start,
372 int end,
373 double *vv,
374 double *vev)
375 {
376 const double D0 = 0.0;
377 const double D1 = 1.0;
378 const char TRANS_T = 'T';
379 const char TRANS_N = 'N';
380 const int one = 1;
381
382 const double fpos = os_factor + ss_factor;
383 const double fneg = -1.0 * ss_factor;
384
385 //#pragma omp parallel
386 //{
387 // double *xj = calloc(nmo*nocc, sizeof(double));
388 // double *xi = calloc(nmo*nocc, sizeof(double));
389 // double *qx = calloc(naux*nmo, sizeof(double));
390 // double *qj = calloc(naux*nocc, sizeof(double));
391 // double *ej = calloc(nocc, sizeof(double));
392 //
393 // double *vv_priv = calloc(nmo*nmo, sizeof(double));
394 // double *vev_priv = calloc(nmo*nmo, sizeof(double));
395 //
396 // int i, a, ia;
397 //
398 //#pragma omp for
399 // for (ia = start; ia < end; ia++) {
400 // i = ia / nvir;
401 // a = ia % nvir;
402 //
403 // // build qx
404 // AGF2slice_01i(qxi, naux, nmo, nocc, i, qx);
405 //
406 // // build qj
407 // AGF2slice_01i(qja, naux, nocc, nvir, a, qj);
408 //
409 // // build xj = xq * qj
410 // dgemm_(&TRANS_N, &TRANS_T, &nocc, &nmo, &naux, &D1, qj, &nocc, qx, &nmo, &D0, xj, &nocc);
411 //
412 // // build xi = xiq * q is this slow without incx=1?
413 // dgemv_(&TRANS_N, &nxi, &naux, &D1, qxi, &nxi, &(qja[i*nvir+a]), &nja, &D0, xi, &one);
414 //
415 // // build eija = ei + ej - ea
416 // AGF2sum_inplace_ener(e_i[i], e_i, &(e_a[a]), nocc, one, ej);
417 //
418 // // inplace xi = 2 * xj - xi
419 // AGF2sum_inplace(xi, xj, nxi, fpos, fneg);
420 //
421 // // vv_xy += xj * (2 * yj - yi)
422 // dgemm_(&TRANS_T, &TRANS_N, &nmo, &nmo, &nocc, &D1, xi, &nocc, xj, &nocc, &D1, vv_priv, &nmo);
423 //
424 // // inplace xj = ej * xj
425 // AGF2prod_inplace_ener(ej, xj, nmo, nocc);
426 //
427 // // vev_xy += xj * ej * (2 * yj - yi)
428 // dgemm_(&TRANS_T, &TRANS_N, &nmo, &nmo, &nocc, &D1, xi, &nocc, xj, &nocc, &D1, vev_priv, &nmo);
429 // }
430 //
431 // free(xj);
432 // free(xi);
433 // free(qx);
434 // free(qj);
435 // free(ej);
436
437 #pragma omp parallel
438 {
439 double *qx_i = calloc(naux*nmo, sizeof(double));
440 double *qx_j = calloc(naux*nmo, sizeof(double));
441 double *qa_i = calloc(naux*nvir, sizeof(double));
442 double *qa_j = calloc(naux*nvir, sizeof(double));
443 double *xa_i = calloc(nmo*nvir, sizeof(double));
444 double *xa_j = calloc(nmo*nvir, sizeof(double));
445 double *ea = calloc(nvir, sizeof(double));
446
447 double *vv_priv = calloc(nmo*nmo, sizeof(double));
448 double *vev_priv = calloc(nmo*nmo, sizeof(double));
449
450 int i, j, ij;
451
452 #pragma omp for
453 for (ij = start; ij < end; ij++) {
454 i = ij / nocc;
455 j = ij % nocc;
456
457 // build qx_i
458 AGF2slice_01i(qxi, naux, nmo, nocc, i, qx_i);
459
460 // build qx_j
461 AGF2slice_01i(qxi, naux, nmo, nocc, j, qx_j);
462
463 // build qa_i
464 AGF2slice_0i2(qja, naux, nocc, nvir, i, qa_i);
465
466 // build qa_j
467 AGF2slice_0i2(qja, naux, nocc, nvir, j, qa_j);
468
469 // build xija
470 dgemm_(&TRANS_N, &TRANS_T, &nvir, &nmo, &naux, &D1, qa_i, &nvir, qx_j, &nmo, &D0, xa_i, &nvir);
471
472 // build xjia
473 dgemm_(&TRANS_N, &TRANS_T, &nvir, &nmo, &naux, &D1, qa_j, &nvir, qx_i, &nmo, &D0, xa_j, &nvir);
474
475 // build eija = ei + ej - ea
476 AGF2sum_inplace_ener(e_i[i], &(e_i[j]), e_a, one, nvir, ea);
477
478 // inplace xjia = 2 * xija - xjia
479 AGF2sum_inplace(xa_j, xa_i, nmo*nvir, fpos, fneg);
480
481 // vv_xy += xija * (2 * yija - yjia)
482 dgemm_(&TRANS_T, &TRANS_N, &nmo, &nmo, &nvir, &D1, xa_j, &nvir, xa_i, &nvir, &D1, vv_priv, &nmo);
483
484 // inplace xija = eija * xija
485 AGF2prod_inplace_ener(ea, xa_i, nmo, nvir);
486
487 // vv_xy += xija * (2 * yija - yjia)
488 dgemm_(&TRANS_T, &TRANS_N, &nmo, &nmo, &nvir, &D1, xa_j, &nvir, xa_i, &nvir, &D1, vev_priv, &nmo);
489 }
490
491 free(qx_i);
492 free(qx_j);
493 free(qa_i);
494 free(qa_j);
495 free(xa_i);
496 free(xa_j);
497 free(ea);
498
499 #pragma omp critical
500 for (i = 0; i < (nmo*nmo); i++) {
501 vv[i] += vv_priv[i];
502 vev[i] += vev_priv[i];
503 }
504
505 free(vv_priv);
506 free(vev_priv);
507 }
508 }
509