1 /*-*-mode: C; c-indentation-style: "bsd"; c-basic-offset: 4; -*-*/
2 /* The linearly-scaling quadratic response XC evaluator.
3 (c) Pawel Salek, pawsa@theochem.kth.se.
4 2004.05.26 - 2005.03
5 */
6
7 #define _XOPEN_SOURCE 500
8 #define _XOPEN_SOURCE_EXTENDED 1
9
10 #include <assert.h>
11 #include <math.h>
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15 #include <time.h>
16 #include <sys/times.h>
17 #include <unistd.h>
18
19 #define __CVERSION__
20 #include "general.h"
21 #include "integrator.h"
22 #include "functionals.h"
23
24 #include "inforb.h"
25
/* Nonzero enables the debug matrix printouts scattered through this file. */
const static integer DFTQR_DEBUG = 0;

/* Fortran helper DEQ27; used below to build the one-index transformed
 * matrices d->my and d->mz from the kappa vectors (see qrbl_data_init).
 * NOTE(review): argument semantics taken from the call sites here —
 * confirm against the Fortran source of DEQ27. */
void FSYM(deq27)(const real* cmo, const real* ubo, const real* dv,
                 real* dxcao, real* dxvao, real* wrk, integer* lfrsav);
30
/* QuadBlData: all state threaded through the blocked quadratic-response
 * integration callbacks (allocated by qrbl_data_init, released by
 * qrbl_data_free). */
typedef struct {
    real *res_omega, *res_omY, *res_omZ; /* accumulated AO result matrices */
    real *my, *mz, *myzzy; /* matrices we compute expectation values of *
                            * my is [rho,Y], etc. */
    real *yy, *zz, *yzzy;  /* vectors of expectation values, per grid point */
    real *vy, *vz;         /* transformed orbitals (per-batch scratch) */
    integer spinY, symY;   /* spin/symmetry of perturbation Y */
    integer spinZ, symZ;   /* spin/symmetry of perturbation Z */
    integer symYZ;         /* symmetry of the result = symA */
    unsigned is_gga:1;     /* whether we should do the density-gradient
                            * dependent part of the transformation */
} QuadBlData;
43
/* Scalar constants kept addressable for Fortran-style (by-reference)
 * BLAS calls in this file. */
static const real MONER = -1.0;
static const real HALFR = 0.5;
46 /* [D, X] =
47 * [ A B ] [ 0 B ]
48 * X = [ C D ] -> [D, X] = [-C 0 ]
49 * where the blocks sizes are determined by the occupied/virtual split.
50 */
51 static void
commute_d_x(real * RESTRICT kappaY,integer symY,real * RESTRICT commuted_mat)52 commute_d_x(real * RESTRICT kappaY, integer symY,
53 real * RESTRICT commuted_mat)
54 {
55 integer isym, i, j;
56 integer norbt = inforb_.norbt;
57 #if 0
58 integer nocc = inforb_.nocct;
59 /* occupied columns */
60 for(i=0; i<nocc; i++) {
61 for(j=0; j<nocc; j++) commuted_mat[j+i*norbt] = 0;
62 for(j=nocc; j<norbt; j++) commuted_mat[j+i*norbt] = -kappaY[j+i*norbt];
63 }
64 /* virtual columns */
65 for(i=nocc; i<norbt; i++) {
66 for(j=0; j<nocc; j++) commuted_mat[j+i*norbt] = kappaY[j+i*norbt];
67 for(j=nocc; j<norbt; j++) commuted_mat[j+i*norbt] = 0;
68 }
69 #else
70 memset(commuted_mat, 0, norbt*norbt*sizeof(real));
71 for(isym=0; isym<inforb_.nsym; isym++) {
72 /* the block in question corresponds to (isym,jsym) block */
73 integer istart= inforb_.iorb[isym];
74 integer iocc = inforb_.nocc[isym];
75 integer iorb = inforb_.norb[isym];
76 integer jsym = inforb_.muld2h[symY-1][isym]-1;
77 integer jstart= inforb_.iorb[jsym];
78 integer jocc = inforb_.nocc[jsym];
79 integer jorb = inforb_.norb[jsym];
80 real * RESTRICT res = commuted_mat + istart + jstart*norbt;
81 real * RESTRICT ky = kappaY + istart + jstart*norbt;
82 /* occupied columns */
83 for(j=0; j<jocc; j++) {
84 for(i=0; i<iocc; i++) res[i+j*norbt] = 0;
85 for(i=iocc; i<iorb; i++) res[i+j*norbt] = -ky[i+j*norbt];
86 }
87 /* virtual columns */
88 for(j=jocc; j<jorb; j++) {
89 for(i=0; i<iocc; i++) res[i+j*norbt] = ky[i+j*norbt];
90 for(i=iocc; i<iorb; i++) res[i+j*norbt] = 0;
91 }
92 }
93 #endif
94 }
95
96 static __inline__ void
commute_matrices(integer sz,const real * a,const real * b,real * c,integer addp)97 commute_matrices(integer sz, const real* a, const real* b,
98 real* c, integer addp)
99 {
100 static const real MONER = -1.0;
101 const real* firstpref = addp ? &ONER : &ZEROR;
102 /* this could be optimized to use symmetry... */
103 dgemm_("N", "N", &sz, &sz, &sz, &ONER,
104 a, &sz, b, &sz, firstpref, c, &sz);
105 dgemm_("N", "N", &sz, &sz, &sz, &MONER,
106 b, &sz, a, &sz, &ONER, c, &sz);
107 }
108
109 integer FSYM(isetksymop)(const integer *new_ksymop);
/* qrbl_data_init:
 * Allocates and fills all per-response-calculation state in *d:
 *  - result matrices (res_omega/res_omY/res_omZ), zero-initialized;
 *  - my = [rho,Y] and mz = [rho,Z] in AO basis, built via the Fortran
 *    routine DEQ27 and scaled by 0.5;
 *  - myzzy = the AO transform of [Y,[Z,D]] + [Z,[Y,D]] built with
 *    commute_d_x/commute_matrices, per symmetry block.
 * work/lwork provide scratch: layout is [commuted_mat | work1 | work2]
 * of sizes n2orbx, n2basx, n2basx respectively.
 * NOTE(review): the calloc/dal_malloc return values are not checked
 * here, and dmat is referenced only by the disabled debug printout. */
static void
qrbl_data_init(QuadBlData *d, real *cmo, integer is_gga, integer max_block_len,
               real *kappaY, integer symY, integer spinY,
               real *kappaZ, integer symZ, integer spinZ,
               real *dmat,
               real *work, integer *lwork)
{
    static const real DP5R = 0.5;
    real dummy;
    integer isym;
    integer nbast = inforb_.nbast;
    integer norbt = inforb_.norbt;
    /* carve the caller-supplied workspace into three scratch areas */
    real *commuted_mat = work;
    real *work1 = commuted_mat + inforb_.n2orbx;
    real *work2 = work1 + inforb_.n2basx;
    integer allocated = inforb_.n2orbx + 2*inforb_.n2basx;
    integer comps = is_gga ? 4 : 1; /* number of components */

    /* result matrices must start from zero: contributions are += */
    d->res_omega = calloc(inforb_.n2basx,sizeof(real));
    d->res_omY = calloc(inforb_.n2basx,sizeof(real));
    d->res_omZ = calloc(inforb_.n2basx,sizeof(real));
    d->my = dal_malloc(inforb_.n2basx*sizeof(real));
    d->mz = dal_malloc(inforb_.n2basx*sizeof(real));
    d->myzzy = dal_malloc(inforb_.n2basx*sizeof(real));
    d->yy = dal_malloc(comps*max_block_len*sizeof(real));
    d->zz = dal_malloc(comps*max_block_len*sizeof(real));
    d->vy = dal_malloc(comps*max_block_len*nbast*sizeof(real));
    d->vz = dal_malloc(comps*max_block_len*nbast*sizeof(real));
    d->yzzy = dal_malloc(comps*max_block_len*sizeof(real));
    d->spinY = spinY; d->symY = symY;
    d->spinZ = spinZ; d->symZ = symZ;
    d->symYZ = inforb_.muld2h[symY-1][symZ-1];

    d->is_gga = is_gga != 0; /* make sure lowest bit is set */
    if(*lwork<allocated)
        dalton_quit("no enough mem in %s", __FUNCTION__);
    /* DEQ27 depends on the global kappa symmetry; set it, build the
     * one-index transformed matrices, then restore the old value. */
    isym = isetksymop_(&symY);
    FSYM(deq27)(cmo, kappaY, &dummy, d->my, &dummy, work, lwork);
    dscal_(&inforb_.n2basx,&DP5R,d->my,&ONEI);
    isetksymop_(&symZ);
    FSYM(deq27)(cmo, kappaZ, &dummy, d->mz, &dummy, work, lwork);
    dscal_(&inforb_.n2basx,&DP5R,d->mz,&ONEI);
    isetksymop_(&isym);
#if 0
    fort_print("cmo");
    outmat_(cmo,&ONEI,&inforb_.nbast,&ONEI,&inforb_.nbast,
            &inforb_.nbast, &inforb_.nbast);
    fort_print("dmat");
    outmat_(dmat,&ONEI,&inforb_.nbast,&ONEI,&inforb_.nbast,
            &inforb_.nbast, &inforb_.nbast);
    fort_print("input Y");
    outmat_(kappaY,&ONEI,&inforb_.nbast,&ONEI,&inforb_.nbast,
            &inforb_.nbast, &inforb_.nbast);
    fort_print("inptu Z");
    outmat_(kappaZ,&ONEI,&inforb_.nbast,&ONEI,&inforb_.nbast,
            &inforb_.nbast, &inforb_.nbast);

    fort_print("mY");
    outmat_(d->my,&ONEI,&inforb_.nbast,&ONEI,&inforb_.nbast,
            &inforb_.nbast, &inforb_.nbast);
#endif
    /* yzzy = c*(dmo*(y*z+z*y) - (y*dmo*z + z*dmo*y))*c' */
    /* FIXME: There is also place for optimization since large blocks
     * of these the partial results have 0 blocks and probably entire
     * [Y,[Z,D]] could be done in one shot considerity sparsity of D
     * in MO basis. */
    commute_d_x(kappaY, symY, commuted_mat);
    commute_matrices(norbt, commuted_mat, kappaZ, work1, 0);
    commute_d_x(kappaZ, symZ, commuted_mat);
    commute_matrices(norbt, commuted_mat, kappaY, work1, 1);
    if(DFTQR_DEBUG) {
        fort_print("mYZZY (MO), sym: %d", d->symYZ);
        outmat_(work1,&ONEI,&inforb_.norbt,&ONEI,&inforb_.norbt,
                &inforb_.norbt, &inforb_.norbt);
    }

    /* transform work1 to AO basis, (isym,jsym) blocks... */
    memset(d->myzzy, 0, nbast*nbast*sizeof(real));
    for(isym=0; isym<inforb_.nsym; isym++) {
        integer ibasi = inforb_.ibas[isym];
        integer nbasi = inforb_.nbas[isym];
        integer iorbi = inforb_.iorb[isym];
        integer norbi = inforb_.norb[isym];
        integer icmoi = inforb_.icmo[isym];
        integer jsym  = inforb_.muld2h[d->symYZ-1][isym]-1;
        integer ibasj = inforb_.ibas[jsym];
        integer nbasj = inforb_.nbas[jsym];
        integer iorbj = inforb_.iorb[jsym];
        integer norbj = inforb_.norb[jsym];
        integer icmoj = inforb_.icmo[jsym];
        if(norbi == 0 || norbj == 0) continue; /* empty symmetry block */
        /* AO block = 0.5 * C(isym) * work1(isym,jsym) * C(jsym)^T */
        dgemm_("N","N", &nbasi, &norbj, &norbi,
               &HALFR, cmo + icmoi, &nbasi, work1 + iorbi + iorbj*norbt, &norbt,
               &ZEROR, work2, &nbasi);
        dgemm_("N","T", &nbasi, &nbasj, &norbj,
               &ONER, work2, &nbasi, cmo + icmoj, &nbasj,
               &ZEROR, d->myzzy + ibasi + ibasj*nbast, &nbast);
    }
    if(DFTQR_DEBUG) {
        fort_print("mYZZY (AO), sym: %d", d->symYZ);
        outmat_(d->myzzy,&ONEI,&inforb_.nbast,&ONEI,&inforb_.nbast,
                &inforb_.nbast, &inforb_.nbast);
    }
}
214
215 static void
qrbl_data_free(QuadBlData * d)216 qrbl_data_free(QuadBlData *d)
217 {
218 free(d->res_omega); free(d->res_omY); free(d->res_omZ);
219 free(d->my); free(d->mz); free(d->myzzy);
220 free(d->yy); free(d->zz); free(d->yzzy);
221 free(d->vy); free(d->vz);
222 }
223
224 static void
qrbl_eval_rho_vars(DftIntegratorBl * grid,QuadBlData * data,real * tmp,integer bllen,integer blstart,integer blend)225 qrbl_eval_rho_vars(DftIntegratorBl* grid, QuadBlData *data,
226 real *tmp, integer bllen, integer blstart, integer blend)
227 {
228 /* compute vector of transformed densities vt */
229 FSYM2(getexp_blocked_lda)(&data->symY, data->my, grid->atv,
230 grid->bas_bl_cnt, grid->basblocks, &grid->shl_bl_cnt,
231 tmp, &bllen, data->yy);
232 FSYM2(getexp_blocked_lda)(&data->symZ, data->mz, grid->atv,
233 grid->bas_bl_cnt, grid->basblocks, &grid->shl_bl_cnt,
234 tmp, &bllen, data->zz);
235 FSYM2(getexp_blocked_lda)(&data->symYZ, data->myzzy, grid->atv,
236 grid->bas_bl_cnt, grid->basblocks, &grid->shl_bl_cnt,
237 tmp, &bllen, data->yzzy);
238 }
239
240
241 /* ===================================================================
242 * LDA-optimized code
243 * =================================================================== */
244 /* add_dft_contribution:
245 computes dft contribution to the quadratic response using precomputed
246 traces of the operators (trY, trZ, etc).
247 */
248 static __inline__ real
min(real a,real b)249 min(real a, real b)
250 { return a>b ? b : a; }
251 static void
qrbl_add_lda_contribution(DftIntegratorBl * grid,QuadBlData * d,real * tmp,integer bllen,integer blstart,integer blend)252 qrbl_add_lda_contribution(DftIntegratorBl* grid, QuadBlData* d,
253 real *tmp, integer bllen, integer blstart, integer blend)
254 {
255 static const real sgn[2]={1.0,-1.0};
256 integer i, j, k, isym, ibl, jbl;
257 real pref2b[DFT_BLLEN], pref2c[DFT_BLLEN], pref3[DFT_BLLEN];
258 real * RESTRICT aos = grid->atv;
259 integer sY = d->spinY, sZ = d->spinZ;
260 real *om = d->res_omega, *omY = d->res_omY, *omZ = d->res_omZ;
261 FunDensProp dp = {0};
262
263 for(j=blstart; j<blend; j++) {
264 ThirdDrv drvs; /* the functional derivatives */
265 real weight = grid->weight[grid->curr_point+j];
266 dp.rhoa = dp.rhob = 0.5*grid->r.rho[j];
267 dftpot2_(&drvs, weight, &dp, 0, sY != sZ);
268 /* FIXME: simplify this */
269 /* and first order contribution */
270 pref2b[j] = d->yy[j]*drvs.fRR[sY];
271 pref2c[j] = d->zz[j]*drvs.fRR[sZ];
272
273 /* third order, and part of second order contribution */
274 pref3[j] =
275 d->yy[j]*d->zz[j]*drvs.fRRR[sY|sZ] +
276 d->yzzy[j] *drvs.fRR[sY^sZ];
277 }
278
279 for(isym=0; isym<grid->nsym; isym++) {
280 integer (*RESTRICT iblocks)[2] = BASBLOCK(grid,isym);
281 integer ibl_cnt = grid->bas_bl_cnt[isym];
282
283 for(ibl=0; ibl<ibl_cnt; ibl++)
284 for(i=iblocks[ibl][0]-1; i<iblocks[ibl][1]; i++) {
285 integer ioff = i*bllen;
286 for(k=blstart; k<blend; k++) {
287 d->vz[k+ioff] = -pref2b[k]*aos[k+ioff];
288 d->vy[k+ioff] = -pref2c[k]*aos[k+ioff];
289 tmp[k+ioff] = pref3[k]*aos[k+ioff];
290 }
291 }
292
293 /* Compute contributions to om, omY and omZ. We could have
294 * special case when all these three matrices have same
295 * symmetry but we do not bother since this code is most
296 * likely not going to be used a lot and it is best to keep it
297 * simple. */
298 for(ibl=0; ibl<ibl_cnt; ibl++) {
299 for(i=iblocks[ibl][0]-1; i<iblocks[ibl][1]; i++) {
300 integer jsym, jbl_cnt, ioff = i*inforb_.nbast;
301 real * RESTRICT tmpi = tmp + i*bllen;
302 real * RESTRICT vyi = d->vy + i*bllen;
303 real * RESTRICT vzi = d->vz + i*bllen;
304 integer (*RESTRICT jblocks)[2];
305
306 jsym = inforb_.muld2h[d->symYZ-1][isym]-1;
307 jblocks = BASBLOCK(grid,jsym);
308 jbl_cnt = grid->bas_bl_cnt[jsym];
309 for(jbl=0; jbl<jbl_cnt; jbl++) {
310 integer jtop = min(jblocks[jbl][1],i+1);
311 for(j=jblocks[jbl][0]-1; j<jblocks[jbl][1]; j++) {
312 real * RESTRICT aosj = aos + j*bllen;
313 real s = 0;
314 for(k=blstart; k<blend; k++) s += aosj[k]*tmpi[k];
315 om[j+ioff] += s;
316 }
317 }
318 jsym = inforb_.muld2h[d->symZ-1][isym]-1;
319 jblocks = BASBLOCK(grid,jsym);
320 jbl_cnt = grid->bas_bl_cnt[jsym];
321 for(jbl=0; jbl<jbl_cnt; jbl++) {
322 integer jtop = min(jblocks[jbl][1],i+1);
323 for(j=jblocks[jbl][0]-1; j<jblocks[jbl][1]; j++) {
324 real * RESTRICT aosj = aos + j*bllen;
325 real s = 0;
326 for(k=blstart; k<blend; k++) s += aosj[k]*vyi[k];
327 omY[j+ioff] += s;
328 }
329 }
330 jsym = inforb_.muld2h[d->symY-1][isym]-1;
331 jblocks = BASBLOCK(grid,jsym);
332 jbl_cnt = grid->bas_bl_cnt[jsym];
333 for(jbl=0; jbl<jbl_cnt; jbl++) {
334 integer jtop = min(jblocks[jbl][1],i+1);
335 for(j=jblocks[jbl][0]-1; j<jblocks[jbl][1]; j++) {
336 real * RESTRICT aosj = aos + j*bllen;
337 real s = 0;
338 for(k=blstart; k<blend; k++) s += aosj[k]*vzi[k];
339 omZ[j+ioff] += s;
340 }
341 }
342 /* Start next column i... */
343 }
344 }
345 }
346 }
347 static void
qrbl_lda_cb(DftIntegratorBl * grid,real * RESTRICT tmp,integer bllen,integer blstart,integer blend,QuadBlData * data)348 qrbl_lda_cb(DftIntegratorBl* grid, real * RESTRICT tmp,
349 integer bllen, integer blstart, integer blend,
350 QuadBlData* data)
351 {
352 qrbl_eval_rho_vars (grid, data, tmp, bllen, blstart, blend);
353 qrbl_add_lda_contribution(grid, data, tmp, bllen, blstart, blend);
354 }
355
356 /* ===================================================================
357 * GGA-capable code
358 * =================================================================== */
359 static void
qrbl_eval_gga_vars(DftIntegratorBl * grid,QuadBlData * data,real * tmp,integer bllen,integer blstart,integer blend)360 qrbl_eval_gga_vars(DftIntegratorBl* grid, QuadBlData *data,
361 real *tmp, integer bllen, integer blstart, integer blend)
362 {
363 /* compute vector of transformed densities and density gradients vt */
364 FSYM2(getexp_blocked_gga)(&data->symY, data->my, grid->atv,
365 grid->bas_bl_cnt, grid->basblocks, &grid->shl_bl_cnt,
366 tmp, &bllen, (double(*)[4])data->yy);
367 FSYM2(getexp_blocked_gga)(&data->symZ, data->mz, grid->atv,
368 grid->bas_bl_cnt, grid->basblocks, &grid->shl_bl_cnt,
369 tmp, &bllen, (double(*)[4])data->zz);
370 FSYM2(getexp_blocked_gga)(&data->symYZ, data->myzzy, grid->atv,
371 grid->bas_bl_cnt, grid->basblocks, &grid->shl_bl_cnt,
372 tmp, &bllen, (double(*)[4])data->yzzy);
373 }
374
375 static void
qrbl_add_gga_contribution(DftIntegratorBl * grid,QuadBlData * d,real * tmp,integer bllen,integer blstart,integer blend)376 qrbl_add_gga_contribution(DftIntegratorBl* grid, QuadBlData* d,
377 real *tmp, integer bllen, integer blstart, integer blend)
378 {
379 static const real sgn[2]={1.0,-1.0};
380 integer i, j, k, isym, ibl, jbl;
381 /* pref3 is the prefactor of the double-commuted term, pref2a is
382 * the prefactor of commuted with Z, and pref2b - with commuted
383 * with Y. */
384 real pref2b[DFT_BLLEN][4], pref2c[DFT_BLLEN][4], pref3[DFT_BLLEN][4];
385 real * RESTRICT aos = grid->atv;
386 real * RESTRICT aox = grid->atv+bllen*inforb_.nbast;
387 real * RESTRICT aoy = grid->atv+bllen*inforb_.nbast*2;
388 real * RESTRICT aoz = grid->atv+bllen*inforb_.nbast*3;
389 integer sY = d->spinY, sZ = d->spinZ;
390 real *om = d->res_omega, *omY = d->res_omY, *omZ = d->res_omZ;
391 FunDensProp dp = {0};
392 real (*trY)[4] = (real (*)[4])d->yy;
393 real (*trZ)[4] = (real (*)[4])d->zz;
394 real (*yzzy)[4] = (real (*)[4])d->yzzy;
395 real (*grad)[3] = grid->g.grad;
396 integer can_collapse_loops =
397 (d->symY == d ->symZ) && (d->symY == d ->symYZ);
398
399 for(k=blstart; k<blend; k++) {
400 real trYsum, trZsum, trYZZYsum, trYtimesZ, a, b, c, d;
401 ThirdDrv drvs; /* the functional derivatives */
402 real weight = grid->weight[grid->curr_point+k];
403 real ngrad = sqrt(grad[k][0]*grad[k][0]+
404 grad[k][1]*grad[k][1]+
405 grad[k][2]*grad[k][2]);
406 dp.rhoa = dp.rhob = 0.5*grid->r.rho[k];
407 dp.grada = dp.gradb = 0.5*ngrad;
408 dp.gradab = dp.grada*dp.gradb;
409 dftpot2_(&drvs, weight, &dp, 1, sY != sZ);
410 trYsum = 0.5*(trY[k][1] * grad[k][0] + trY[k][2] * grad[k][1]
411 + trY[k][3] * grad[k][2]);
412 trZsum = 0.5*(trZ[k][1] * grad[k][0] + trZ[k][2] * grad[k][1]
413 + trZ[k][3] * grad[k][2]);
414 trYZZYsum = 0.5*(yzzy[k][1] * grad[k][0] + yzzy[k][2] * grad[k][1]
415 + yzzy[k][3] * grad[k][2]);
416 trYtimesZ = trY[k][1]*trZ[k][1] + trY[k][2]*trZ[k][2]
417 + trY[k][3]*trZ[k][3];
418 /* FIXME: simplify this */
419 /* and first order contribution */
420
421 /* third order, and part of second order contribution */
422 pref3[k][0] = trY[k][0]*trZ[k][0]*drvs.fRRR[sY|sZ]+
423 yzzy[k][0]*drvs.fRR[sY^sZ] +
424 2*(drvs.fRRZ[sZ][sY]*trY[k][0]*trZsum +
425 drvs.fRRZ[sY][sZ]*trZ[k][0]*trYsum) +
426 4*trZsum*trYsum*drvs.fRZZ[sY^sZ][sY] +
427 2*(trYtimesZ + trYZZYsum)*drvs.fRZ[sY^sZ];
428
429 pref3[k][0] += drvs.fRRG[sY]*trY[k][0]*trZsum*(1+sgn[sZ])+
430 drvs.fRRG[sZ]*trZ[k][0]*trYsum*(1+sgn[sY])+
431 0.5*drvs.fRG[0]*(1+sgn[sY]*sgn[sZ])*trYZZYsum +
432 0.5*drvs.fRG[0]*(sgn[sY]+sgn[sZ])*trYtimesZ;
433
434 pref2b[k][0] = trY[k][0]*drvs.fRR[sY] + 2*trYsum*drvs.fRZ[sY]
435 + trYsum*drvs.fRG[sY];
436
437 pref2c[k][0] = trZ[k][0]*drvs.fRR[sZ] + 2*trZsum*drvs.fRZ[sZ]
438 + trZsum*drvs.fRG[sZ];
439
440
441 a = trY[k][0]*(2*drvs.fRZ[sY]+drvs.fRG[sY])+4*trYsum*drvs.fZZ[sY];
442 b = 2*drvs.fZ + sgn[sY]*drvs.fG;
443 b *= 2;
444
445 pref2b[k][1] = a*grad[k][0] + b*trY[k][1];
446 pref2b[k][2] = a*grad[k][1] + b*trY[k][2];
447 pref2b[k][3] = a*grad[k][2] + b*trY[k][3];
448
449
450 a = trZ[k][0]*(2*drvs.fRZ[sZ]+drvs.fRG[sZ])+4*trZsum*drvs.fZZ[sZ];
451 b = 2*drvs.fZ + sgn[sZ]*drvs.fG;
452 b *= 2;
453
454 pref2c[k][1] = a*grad[k][0] + b*trZ[k][1];
455 pref2c[k][2] = a*grad[k][1] + b*trZ[k][2];
456 pref2c[k][3] = a*grad[k][2] + b*trZ[k][3];
457
458 /* xyz components of pref3 */
459 a = (8*drvs.fZZZ[sY|sZ]*trYsum*trZsum +
460 4*(drvs.fRZZ[sY][sZ]*trY[k][0]*trZsum +
461 drvs.fRZZ[sZ][sY]*trZ[k][0]*trYsum)+
462 2*drvs.fRRZ[sY^sZ][sY]*trY[k][0]*trZ[k][0]);
463 a += drvs.fRRGX[sY][sZ]*trY[k][0]*trZ[k][0]
464 + 4*drvs.fZZ[sY^sZ]*trYZZYsum
465 + 4*drvs.fZZ[sY^sZ]*trYtimesZ
466 + 2*drvs.fRZ[sY^sZ]*yzzy[k][0];
467 a += drvs.fRG[sY^sZ]*yzzy[k][0];
468
469 b = 4*drvs.fZZ[sY]*trYsum + 2*drvs.fRZ[sY]*trY[k][0]
470 + drvs.fRG[sY]*trY[k][0]*sgn[sZ];
471 c = 4*drvs.fZZ[sZ]*trZsum + 2*drvs.fRZ[sZ]*trZ[k][0]
472 + drvs.fRG[sZ]*trZ[k][0]*sgn[sY];
473 d = 2*drvs.fZ + sgn[sY]*sgn[sZ]*drvs.fG;
474 b *= 2; c *= 2; d *= 2;
475
476 pref3[k][1] = a*grad[k][0] + b*trZ[k][1] + c*trY[k][1] + d*yzzy[k][1];
477 pref3[k][2] = a*grad[k][1] + b*trZ[k][2] + c*trY[k][2] + d*yzzy[k][2];
478 pref3[k][3] = a*grad[k][2] + b*trZ[k][3] + c*trY[k][3] + d*yzzy[k][3];
479 }
480
481 for(isym=0; isym<grid->nsym; isym++) {
482 integer (*RESTRICT iblocks)[2] = BASBLOCK(grid,isym);
483 integer ibl_cnt = grid->bas_bl_cnt[isym];
484 for(ibl=0; ibl<ibl_cnt; ibl++)
485 for(i=iblocks[ibl][0]-1; i<iblocks[ibl][1]; i++) {
486 real * RESTRICT a0 = aos + i*bllen;
487 real * RESTRICT ax = aox + i*bllen;
488 real * RESTRICT ay = aoy + i*bllen;
489 real * RESTRICT az = aoz + i*bllen;
490 real * RESTRICT vyi = d->vy + i*bllen;
491 real * RESTRICT vzi = d->vz + i*bllen;
492 real * RESTRICT tmi = tmp + i*bllen;
493 for(k=blstart; k<blend; k++) {
494 vzi[k] = -(pref2b[k][0]*a0[k] + pref2b[k][1]*ax[k] +
495 pref2b[k][2]*ay[k] + pref2b[k][3]*az[k]);
496 vyi[k] = -(pref2c[k][0]*a0[k] + pref2c[k][1]*ax[k] +
497 pref2c[k][2]*ay[k] + pref2c[k][3]*az[k]);
498 tmi[k] = (pref3 [k][0]*a0[k] + pref3 [k][1]*ax[k] +
499 pref3 [k][2]*ay[k] + pref3 [k][3]*az[k]);
500 }
501 }
502 /* Time to distribute computed prefactors. In principle, we
503 should have three separate loops because three generated
504 matrices may have different symmetries. On the other hand, it
505 is profitable to detect the commonly occuring case of same
506 symmetry and collapse the loops to reduce the overhead.
507 1:special case when all three vectors have same symmetry */
508 if(can_collapse_loops) {
509 integer nbast = inforb_.nbast;
510 integer jsym = inforb_.muld2h[d->symYZ-1][isym]-1;
511 integer jbl_cnt = grid->bas_bl_cnt[jsym];
512 for(ibl=0; ibl<ibl_cnt; ibl++) {
513 for(jbl=0; jbl<jbl_cnt; jbl++) {
514 integer lenbl = blend - blstart;
515 integer (*RESTRICT jblocks)[2] = BASBLOCK(grid,jsym);
516 integer ilen = iblocks[ibl][1] - iblocks[ibl][0] + 1;
517 integer jlen = jblocks[jbl][1] - jblocks[jbl][0] + 1;
518
519 dgemm_("T","N", &jlen, &ilen, &lenbl, &ONER,
520 aos + (jblocks[jbl][0] - 1)*bllen+blstart, &bllen,
521 tmp + (iblocks[ibl][0] - 1)*bllen+blstart, &bllen,
522 &ONER, om + (iblocks[ibl][0]-1)*nbast+jblocks[jbl][0]-1, &nbast);
523
524 dgemm_("T","N", &jlen, &ilen, &lenbl, &ONER,
525 aos + (jblocks[jbl][0] - 1)*bllen+blstart, &bllen,
526 d->vy + (iblocks[ibl][0] - 1)*bllen+blstart, &bllen,
527 &ONER, omY + (iblocks[ibl][0]-1)*nbast+jblocks[jbl][0]-1, &nbast);
528
529 dgemm_("T","N", &jlen, &ilen, &lenbl, &ONER,
530 aos + (jblocks[jbl][0] - 1)*bllen+blstart, &bllen,
531 d->vz + (iblocks[ibl][0] - 1)*bllen+blstart, &bllen,
532 &ONER, omZ + (iblocks[ibl][0]-1)*nbast+jblocks[jbl][0]-1, &nbast);
533 }
534 }
535 continue; /* no need to do the general case here */
536 }
537 /* 2:general case */
538 for(ibl=0; ibl<ibl_cnt; ibl++) {
539 for(i=iblocks[ibl][0]-1; i<iblocks[ibl][1]; i++) {
540 integer ioff = i*inforb_.nbast;
541 real sum;
542 real * RESTRICT tmpi = tmp + i*bllen;
543 real * RESTRICT vyi = d->vy + i*bllen;
544 real * RESTRICT vzi = d->vz + i*bllen;
545 integer jsym = inforb_.muld2h[d->symYZ-1][isym]-1;
546 integer (*RESTRICT jblocks)[2] = BASBLOCK(grid,jsym);
547 integer jbl_cnt = grid->bas_bl_cnt[jsym];
548 for(jbl=0; jbl<jbl_cnt; jbl++) {
549 for(j=jblocks[jbl][0]-1; j<jblocks[jbl][1]; j++) {
550 real * RESTRICT aosj = aos + j*bllen;
551 sum = 0;
552 for(k=blstart; k<blend; k++) sum += aosj[k]*tmpi[k];
553 om [j+ioff] += sum;
554 }
555 }
556 jsym = inforb_.muld2h[d->symZ-1][isym]-1;
557 jblocks = BASBLOCK(grid,jsym);
558 jbl_cnt = grid->bas_bl_cnt[jsym];
559 for(jbl=0; jbl<jbl_cnt; jbl++) {
560 for(j=jblocks[jbl][0]-1; j<jblocks[jbl][1]; j++) {
561 real * RESTRICT aosj = aos + j*bllen;
562 sum = 0;
563 for(k=blstart; k<blend; k++) sum += aosj[k]*vyi[k];
564 omY[j+ioff] += sum;
565 }
566 }
567 jsym = inforb_.muld2h[d->symY-1][isym]-1;
568 jblocks = BASBLOCK(grid,jsym);
569 jbl_cnt = grid->bas_bl_cnt[jsym];
570 for(jbl=0; jbl<jbl_cnt; jbl++) {
571 for(j=jblocks[jbl][0]-1; j<jblocks[jbl][1]; j++) {
572 real * RESTRICT aosj = aos + j*bllen;
573 sum = 0;
574 for(k=blstart; k<blend; k++) sum += aosj[k]*vzi[k];
575 omZ[j+ioff] += sum;
576 }
577 }
578 }
579 }
580 }
581 }
582
583 static void
qrbl_gga_cb(DftIntegratorBl * grid,real * RESTRICT tmp,integer bllen,integer blstart,integer blend,QuadBlData * data)584 qrbl_gga_cb(DftIntegratorBl* grid, real * RESTRICT tmp,
585 integer bllen, integer blstart, integer blend,
586 QuadBlData* data)
587 {
588 qrbl_eval_gga_vars (grid, data, tmp, bllen, blstart, blend);
589 qrbl_add_gga_contribution(grid, data, tmp, bllen, blstart, blend);
590 }
591
592 /* ===================================================================
593 * Parallel section
594 * =================================================================== */
595
596 #if defined(VAR_MPI)
597 #include <mpi.h>
598 #include <our_extra_mpi.h>
599 #define MASTER_NO 0
600 /* dft_qr_faster_slave:
601 this is a slave driver. It's task is to allocate memory needed by
602 the main property evaluator (dftqrcf_ in this case) and call it.
603 */
604 void
dft_qrbl_slave(real * work,integer * lwork,integer * iprint)605 dft_qrbl_slave(real* work, integer* lwork, integer* iprint)
606 {
607 real* fi = malloc(inforb_.n2basx*sizeof(real)); /* OUT */
608 real *cmo = malloc(inforb_.norbt*inforb_.nbast*sizeof(real)); /* IN */
609 real *kappaY= malloc(inforb_.n2orbx*sizeof(real)); /* IN */
610 real *kappaZ= malloc(inforb_.n2orbx*sizeof(real)); /* IN */
611 integer addfock, symY, symZ, spinY, spinZ; /* IN */
612 FSYM2(dft_qr_respons)(fi, cmo, kappaY, &symY, &spinY, kappaZ, &symZ, &spinZ,
613 &addfock, work, lwork, iprint);
614 free(kappaZ);
615 free(kappaY);
616 free(cmo);
617 free(fi);
618 }
619
620 static void
qrbl_sync_slaves(real * cmo,real * kappaY,real * kappaZ,integer * addfock,integer * symY,integer * symZ,integer * spinY,integer * spinZ)621 qrbl_sync_slaves(real* cmo, real* kappaY, real* kappaZ, integer* addfock,
622 integer* symY, integer* symZ, integer* spinY, integer* spinZ)
623 {
624 static const SyncData sync_data[] = {
625 { &inforb_.nocc, 8, fortran_MPI_INT },
626 { &inforb_.nocct, 1, fortran_MPI_INT },
627 { &inforb_.nvirt, 1, fortran_MPI_INT },
628 };
629 #ifdef C99_COMPILER
630 const SyncData data2[] = {
631 { cmo, &inforb_.norbt*inforb_.nbast,MPI_DOUBLE },
632 { kappaY, &inforb_.n2orbx, MPI_DOUBLE },
633 { kappaZ, &inforb_.n2orbx, MPI_DOUBLE },
634 { addfock, 1, fortran_MPI_INT },
635 { symY, 1, fortran_MPI_INT },
636 { symZ, 1, fortran_MPI_INT },
637 { spinY, 1, fortran_MPI_INT },
638 { spinZ, 1, fortran_MPI_INT }
639 };
640 #else /* C99_COMPILER */
641 /* this is more error-prone but some compilers (HP/UX)... */
642 static SyncData data2[] =
643 { {NULL, 0, MPI_DOUBLE}, {NULL, 0, MPI_DOUBLE}, {NULL, 0, MPI_DOUBLE},
644 {NULL, 0, fortran_MPI_INT }, {NULL, 0, fortran_MPI_INT }, {NULL, 0, fortran_MPI_INT },
645 {NULL, 0, fortran_MPI_INT }, {NULL, 0, fortran_MPI_INT } };
646 data2[0].data = cmo; data2[0].count = inforb_.norbt*inforb_.nbast;
647 data2[1].data = kappaY; data2[1].count = inforb_.n2orbx;
648 data2[2].data = kappaZ; data2[2].count = inforb_.n2orbx;
649 data2[3].data = addfock; data2[3].count = 1;
650 data2[4].data = symY; data2[4].count = 1;
651 data2[5].data = symZ; data2[5].count = 1;
652 data2[6].data = spinY; data2[6].count = 1;
653 data2[7].data = spinZ; data2[7].count = 1;
654 #endif /* C99_COMPILER */
655
656 mpi_sync_data(sync_data, ELEMENTS(sync_data));
657 mpi_sync_data(data2, ELEMENTS(data2));
658 }
659 static __inline__ void
qrbl_collect_info(real * fi,real * work,integer lwork)660 qrbl_collect_info(real* fi, real*work, integer lwork)
661 {
662 int sz = 0;
663 MPI_Comm_size(MPI_COMM_WORLD, &sz);
664 if(sz<=1) return;
665
666 CHECK_WRKMEM(inforb_.n2orbx,lwork);
667 dcopy_(&inforb_.n2orbx, fi, &ONEI, work, &ONEI);
668 integer n2orbx_int = inforb_.n2orbx;
669 MPI_Reduce(work, fi, n2orbx_int, MPI_DOUBLE, MPI_SUM,
670 MASTER_NO, MPI_COMM_WORLD);
671 }
672 #else /* VAR_MPI */
673 #define qrbl_sync_slaves(cmo,kappaY,kappaZ,addfck,symY,symZ,spinY,spinZ)
674 #define qrbl_collect_info(fi,work,lwork)
675 #endif /* VAR_MPI */
676
677 /* ===================================================================
678 * Main driver
679 * =================================================================== */
/* dft_qr_respons:
 * Main driver for the linearly-scaling DFT quadratic-response
 * contribution.  Wakes and synchronizes MPI slaves, builds the
 * per-calculation data (qrbl_data_init), integrates over the grid
 * with the LDA or GGA callback, symmetrizes the accumulated AO
 * matrices, transforms them to MO basis, adds the kappa commutator
 * terms, and accumulates 0.5 * result into fi.
 *
 * fi      - in/out MO Fock-like matrix the contribution is added to
 * cmo     - MO coefficients
 * kappaY/kappaZ, symY/symZ, spinY/spinZ - the two response vectors
 * addfock - nonzero requests the old non-blocked code path (dftqrcf_)
 * work/lwork - Fortran-style workspace; iprint - verbosity
 */
void
FSYM2(dft_qr_respons)(real *fi, real *cmo,
                      real *kappaY, integer *symY, integer *spinY,
                      real *kappaZ, integer *symZ, integer *spinZ,
                      integer *addfock, real *work, integer *lwork, integer *iprint)
{
    static integer msg_printed = 0;
    struct tms starttm, endtm; clock_t utm;
    real electrons;
    QuadBlData qr_data; /* quadratic response data */
    integer i, j;
    real *dmat;
    real *tmp1, *tmp2;
    DftBlockCallback cb;

    /* WARNING: NO work MAY BE done before syncing slaves! */
    dft_wake_slaves((DFTPropEvalMaster)FSYM2(dft_qr_respons)); /* NO-OP in serial */
    qrbl_sync_slaves(cmo,kappaY,kappaZ,addfock,
                     symY,symZ, spinY,spinZ);
    if(*addfock) {/* call the old, slow version here */
#ifdef VAR_MPI
        fort_print("This was supposed not to be called.");
        dalton_quit("This calculation not reported to work in parallel.");
#endif
        dftqrcf_(fi, cmo, kappaY, symY, spinY,
                 kappaZ, symZ, spinZ, addfock,
                 work, lwork, iprint);
        return;
    }
    dmat = dal_malloc(inforb_.n2basx*sizeof(real));
    if(!msg_printed) { /* announce this code path only once per run */
        fort_print("DFT-QR computed in a linearly-scaling fashion.\n");
        msg_printed = 1;
    }
    times(&starttm);
    FSYM2(dft_get_ao_dens_mat)(cmo, dmat, work, lwork);
    qrbl_data_init(&qr_data, cmo, selected_func->is_gga(), DFT_BLLEN,
                   kappaY, *symY, *spinY,
                   kappaZ, *symZ, *spinZ,
                   dmat, work, lwork);
    cb = (DftBlockCallback)
        (qr_data.is_gga ? qrbl_gga_cb : qrbl_lda_cb );
    electrons = dft_integrate_ao_bl(1, dmat, work, lwork, iprint, 0,
                                    cb, &qr_data);
    free(dmat);
    if(DFTQR_DEBUG) {
        fort_print("DFT quadratic contribution (AO)");
        outmat_(qr_data.res_omega, &ONEI, &inforb_.nbast, &ONEI,
                &inforb_.nbast, &inforb_.nbast, &inforb_.nbast);
        outmat_(qr_data.res_omY, &ONEI, &inforb_.nbast, &ONEI,
                &inforb_.nbast, &inforb_.nbast, &inforb_.nbast);
        outmat_(qr_data.res_omZ, &ONEI, &inforb_.nbast, &ONEI,
                &inforb_.nbast, &inforb_.nbast, &inforb_.nbast);
    }
    /* Symmetrize the accumulated AO matrices by averaging each (i,j)
     * pair with its transpose.  (Original note said the LDA callback
     * computes only half - NOTE(review): its jtop bounds are unused,
     * so full matrices are accumulated; averaging is correct either
     * way - confirm.) */
    for(i=0; i<inforb_.nbast; i++) {
        for(j=0; j<i; j++) {
            integer ji = j + i*inforb_.nbast;
            integer ij = i + j*inforb_.nbast;
            real avg = 0.5*(qr_data.res_omega[ij]+qr_data.res_omega[ji]);
            qr_data.res_omega[ij] = qr_data.res_omega[ji] = avg;
            avg = 0.5*(qr_data.res_omY[ij] + qr_data.res_omY[ji]);
            qr_data.res_omY[ij] = qr_data.res_omY[ji] = avg;
            avg = 0.5*(qr_data.res_omZ[ij] + qr_data.res_omZ[ji]);
            qr_data.res_omZ[ij] = qr_data.res_omZ[ji] = avg;
        }
    }
    /* Transform to MO Fock matrix contribution: fix the symmetry of
     * the result! */
    tmp1 = dal_malloc(inforb_.n2orbx*sizeof(real));
    tmp2 = dal_malloc(inforb_.n2orbx*sizeof(real));
    dzero_(tmp1, &inforb_.n2orbx);
    FSYM(lrao2mo)(cmo, &qr_data.symYZ, qr_data.res_omega, tmp1, work, lwork);
    /* compute commutators and add them */
    dzero_(tmp2, &inforb_.n2orbx);
    FSYM(lrao2mo)(cmo, &qr_data.symZ, qr_data.res_omY, tmp2, work, lwork);
    commute_matrices(inforb_.norbt, tmp2, kappaY, tmp1, 1);
    dzero_(tmp2, &inforb_.n2orbx);
    FSYM(lrao2mo)(cmo, &qr_data.symY, qr_data.res_omZ, tmp2, work, lwork);
    commute_matrices(inforb_.norbt, tmp2, kappaZ, tmp1, 1);
    qrbl_collect_info(tmp1, work, *lwork); /* NO-OP in serial */
    /* fi += 0.5 * tmp1 */
    daxpy_(&inforb_.n2orbx, &HALFR, tmp1, &ONEI, fi, &ONEI);
    if(DFTQR_DEBUG) {
        fort_print("DFT quadratic contribution (MO)");
        outmat_(fi, &ONEI, &inforb_.norbt, &ONEI, &inforb_.norbt,
                &inforb_.norbt, &inforb_.norbt);
    }
    free(tmp1); free(tmp2);
    qrbl_data_free(&qr_data);
    if (*iprint>0) {
        /* report integrated electron count (sanity check) and time */
        times(&endtm);
        utm = endtm.tms_utime-starttm.tms_utime;
        fort_print(" Electrons: %f(%9.3g): QR-DFT/b evaluation time: %9.1f s",
                   electrons, (double)(electrons-(integer)(electrons+0.5)),
                   utm/(double)sysconf(_SC_CLK_TCK));
    }
}
777