1 /*-*-mode: C; c-indentation-style: "bsd"; c-basic-offset: 4; -*-*/
2 /* The linearly-scaling quadratic response XC evaluator.
3    (c) Pawel Salek, pawsa@theochem.kth.se.
4    2004.05.26 - 2005.03
5 */
6 
7 #define _XOPEN_SOURCE          500
8 #define _XOPEN_SOURCE_EXTENDED 1
9 
10 #include <assert.h>
11 #include <math.h>
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15 #include <time.h>
16 #include <sys/times.h>
17 #include <unistd.h>
18 
19 #define __CVERSION__
20 #include "general.h"
21 #include "integrator.h"
22 #include "functionals.h"
23 
24 #include "inforb.h"
25 
26 const static integer DFTQR_DEBUG = 0;
27 
28 void FSYM(deq27)(const real* cmo, const real* ubo, const real* dv,
29                  real* dxcao, real* dxvao, real* wrk, integer* lfrsav);
30 
31 typedef struct {
32     real *res_omega, *res_omY, *res_omZ;
33     real *my, *mz, *myzzy; /* matrices we compute expectation values of *
34                             * my is [rho,Y], etc. */
35     real *yy, *zz, *yzzy;  /* vectors of expectation values */
36     real *vy, *vz;         /* transformed orbitals */
37     integer spinY, symY;
38     integer spinZ, symZ;
39     integer symYZ; /* symmetry of the result = symA */
40     unsigned is_gga:1; /* whether we should do the density-gradient
41                         * dependent part of the transformation */
42 } QuadBlData;
43 
44 static const real MONER = -1.0;
45 static const real HALFR =  0.5;
46 /* [D, X] =
47  *     [ A   B ]             [ 0 B ]
48  * X = [ C   D ] -> [D, X] = [-C 0 ]
49  * where the blocks sizes are determined by the occupied/virtual split.
50  */
51 static void
commute_d_x(real * RESTRICT kappaY,integer symY,real * RESTRICT commuted_mat)52 commute_d_x(real * RESTRICT kappaY, integer symY,
53             real * RESTRICT commuted_mat)
54 {
55     integer isym, i, j;
56     integer norbt = inforb_.norbt;
57 #if 0
58     integer nocc = inforb_.nocct;
59     /* occupied columns */
60     for(i=0; i<nocc; i++) {
61         for(j=0; j<nocc; j++)     commuted_mat[j+i*norbt] = 0;
62         for(j=nocc; j<norbt; j++) commuted_mat[j+i*norbt] = -kappaY[j+i*norbt];
63     }
64     /* virtual columns */
65     for(i=nocc; i<norbt; i++) {
66         for(j=0; j<nocc; j++)     commuted_mat[j+i*norbt] = kappaY[j+i*norbt];
67         for(j=nocc; j<norbt; j++) commuted_mat[j+i*norbt] = 0;
68     }
69 #else
70     memset(commuted_mat, 0, norbt*norbt*sizeof(real));
71     for(isym=0; isym<inforb_.nsym; isym++) {
72         /* the block in question corresponds to (isym,jsym) block */
73         integer istart= inforb_.iorb[isym];
74         integer iocc  = inforb_.nocc[isym];
75         integer iorb  = inforb_.norb[isym];
76         integer jsym  = inforb_.muld2h[symY-1][isym]-1;
77         integer jstart= inforb_.iorb[jsym];
78         integer jocc  = inforb_.nocc[jsym];
79         integer jorb  = inforb_.norb[jsym];
80         real * RESTRICT res = commuted_mat + istart + jstart*norbt;
81         real * RESTRICT ky  = kappaY       + istart + jstart*norbt;
82         /* occupied columns */
83         for(j=0; j<jocc; j++) {
84             for(i=0;    i<iocc; i++) res[i+j*norbt] = 0;
85             for(i=iocc; i<iorb; i++) res[i+j*norbt] = -ky[i+j*norbt];
86         }
87         /* virtual columns */
88         for(j=jocc; j<jorb; j++) {
89             for(i=0;    i<iocc; i++) res[i+j*norbt] = ky[i+j*norbt];
90             for(i=iocc; i<iorb; i++) res[i+j*norbt] = 0;
91         }
92     }
93 #endif
94 }
95 
96 static __inline__ void
commute_matrices(integer sz,const real * a,const real * b,real * c,integer addp)97 commute_matrices(integer sz, const real* a, const real* b,
98                  real* c, integer addp)
99 {
100     static const real MONER = -1.0;
101     const real* firstpref = addp ? &ONER : &ZEROR;
102     /* this could be optimized to use symmetry... */
103     dgemm_("N", "N", &sz, &sz, &sz, &ONER,
104 	   a, &sz, b, &sz, firstpref, c, &sz);
105     dgemm_("N", "N", &sz, &sz, &sz, &MONER,
106            b, &sz, a, &sz, &ONER, c, &sz);
107 }
108 
109 integer FSYM(isetksymop)(const integer *new_ksymop);
110 static void
qrbl_data_init(QuadBlData * d,real * cmo,integer is_gga,integer max_block_len,real * kappaY,integer symY,integer spinY,real * kappaZ,integer symZ,integer spinZ,real * dmat,real * work,integer * lwork)111 qrbl_data_init(QuadBlData *d, real *cmo, integer is_gga, integer max_block_len,
112 	      real *kappaY, integer symY, integer spinY,
113 	      real *kappaZ, integer symZ, integer spinZ,
114 	      real *dmat,
115 	      real *work, integer *lwork)
116 {
117     static const real DP5R  =  0.5;
118     real dummy;
119     integer isym;
120     integer nbast = inforb_.nbast;
121     integer norbt = inforb_.norbt;
122     real *commuted_mat = work;
123     real *work1 = commuted_mat + inforb_.n2orbx;
124     real *work2 = work1        + inforb_.n2basx;
125     integer allocated = inforb_.n2orbx + 2*inforb_.n2basx;
126     integer comps = is_gga ?  4 : 1; /* number of components */
127 
128     d->res_omega = calloc(inforb_.n2basx,sizeof(real));
129     d->res_omY   = calloc(inforb_.n2basx,sizeof(real));
130     d->res_omZ   = calloc(inforb_.n2basx,sizeof(real));
131     d->my    = dal_malloc(inforb_.n2basx*sizeof(real));
132     d->mz    = dal_malloc(inforb_.n2basx*sizeof(real));
133     d->myzzy = dal_malloc(inforb_.n2basx*sizeof(real));
134     d->yy   = dal_malloc(comps*max_block_len*sizeof(real));
135     d->zz   = dal_malloc(comps*max_block_len*sizeof(real));
136     d->vy   = dal_malloc(comps*max_block_len*nbast*sizeof(real));
137     d->vz   = dal_malloc(comps*max_block_len*nbast*sizeof(real));
138     d->yzzy = dal_malloc(comps*max_block_len*sizeof(real));
139     d->spinY = spinY; d->symY = symY;
140     d->spinZ = spinZ; d->symZ = symZ;
141     d->symYZ = inforb_.muld2h[symY-1][symZ-1];
142 
143     d->is_gga = is_gga != 0; /* make sure lowest bit is set */
144     if(*lwork<allocated)
145         dalton_quit("no enough mem in %s", __FUNCTION__);
146     isym = isetksymop_(&symY);
147     FSYM(deq27)(cmo, kappaY, &dummy, d->my,    &dummy, work, lwork);
148     dscal_(&inforb_.n2basx,&DP5R,d->my,&ONEI);
149     isetksymop_(&symZ);
150     FSYM(deq27)(cmo, kappaZ, &dummy, d->mz,    &dummy, work, lwork);
151     dscal_(&inforb_.n2basx,&DP5R,d->mz,&ONEI);
152     isetksymop_(&isym);
153 #if 0
154     fort_print("cmo");
155     outmat_(cmo,&ONEI,&inforb_.nbast,&ONEI,&inforb_.nbast,
156             &inforb_.nbast, &inforb_.nbast);
157     fort_print("dmat");
158     outmat_(dmat,&ONEI,&inforb_.nbast,&ONEI,&inforb_.nbast,
159             &inforb_.nbast, &inforb_.nbast);
160     fort_print("input Y");
161     outmat_(kappaY,&ONEI,&inforb_.nbast,&ONEI,&inforb_.nbast,
162             &inforb_.nbast, &inforb_.nbast);
163     fort_print("inptu Z");
164     outmat_(kappaZ,&ONEI,&inforb_.nbast,&ONEI,&inforb_.nbast,
165             &inforb_.nbast, &inforb_.nbast);
166 
167     fort_print("mY");
168     outmat_(d->my,&ONEI,&inforb_.nbast,&ONEI,&inforb_.nbast,
169             &inforb_.nbast, &inforb_.nbast);
170 #endif
171     /* yzzy = c*(dmo*(y*z+z*y) - (y*dmo*z + z*dmo*y))*c' */
172     /* FIXME: There is also place for optimization since large blocks
173      * of these the partial results have 0 blocks and probably entire
174      * [Y,[Z,D]] could be done in one shot considerity sparsity of D
175      * in MO basis. */
176     commute_d_x(kappaY, symY, commuted_mat);
177     commute_matrices(norbt, commuted_mat, kappaZ, work1, 0);
178     commute_d_x(kappaZ, symZ, commuted_mat);
179     commute_matrices(norbt, commuted_mat, kappaY, work1, 1);
180     if(DFTQR_DEBUG) {
181         fort_print("mYZZY (MO), sym: %d", d->symYZ);
182         outmat_(work1,&ONEI,&inforb_.norbt,&ONEI,&inforb_.norbt,
183                 &inforb_.norbt, &inforb_.norbt);
184     }
185 
186     /* transform work1 to AO basis, (isym,jsym) blocks... */
187     memset(d->myzzy, 0, nbast*nbast*sizeof(real));
188     for(isym=0; isym<inforb_.nsym; isym++) {
189         integer ibasi = inforb_.ibas[isym];
190         integer nbasi = inforb_.nbas[isym];
191         integer iorbi = inforb_.iorb[isym];
192         integer norbi = inforb_.norb[isym];
193         integer icmoi = inforb_.icmo[isym];
194         integer jsym  = inforb_.muld2h[d->symYZ-1][isym]-1;
195         integer ibasj = inforb_.ibas[jsym];
196         integer nbasj = inforb_.nbas[jsym];
197         integer iorbj = inforb_.iorb[jsym];
198         integer norbj = inforb_.norb[jsym];
199         integer icmoj = inforb_.icmo[jsym];
200         if(norbi == 0 || norbj == 0) continue;
201         dgemm_("N","N", &nbasi, &norbj, &norbi,
202                &HALFR, cmo + icmoi, &nbasi, work1 + iorbi + iorbj*norbt, &norbt,
203                &ZEROR, work2, &nbasi);
204         dgemm_("N","T", &nbasi, &nbasj, &norbj,
205                &ONER, work2, &nbasi, cmo + icmoj, &nbasj,
206                &ZEROR, d->myzzy + ibasi + ibasj*nbast, &nbast);
207     }
208     if(DFTQR_DEBUG) {
209         fort_print("mYZZY (AO), sym: %d", d->symYZ);
210         outmat_(d->myzzy,&ONEI,&inforb_.nbast,&ONEI,&inforb_.nbast,
211                 &inforb_.nbast, &inforb_.nbast);
212     }
213 }
214 
215 static void
qrbl_data_free(QuadBlData * d)216 qrbl_data_free(QuadBlData *d)
217 {
218     free(d->res_omega); free(d->res_omY); free(d->res_omZ);
219     free(d->my); free(d->mz); free(d->myzzy);
220     free(d->yy); free(d->zz); free(d->yzzy);
221     free(d->vy); free(d->vz);
222 }
223 
224 static void
qrbl_eval_rho_vars(DftIntegratorBl * grid,QuadBlData * data,real * tmp,integer bllen,integer blstart,integer blend)225 qrbl_eval_rho_vars(DftIntegratorBl* grid, QuadBlData *data,
226 		  real *tmp, integer bllen, integer blstart, integer blend)
227 {
228     /* compute vector of transformed densities vt */
229     FSYM2(getexp_blocked_lda)(&data->symY, data->my, grid->atv,
230                         grid->bas_bl_cnt, grid->basblocks, &grid->shl_bl_cnt,
231                         tmp, &bllen, data->yy);
232     FSYM2(getexp_blocked_lda)(&data->symZ, data->mz, grid->atv,
233                         grid->bas_bl_cnt, grid->basblocks, &grid->shl_bl_cnt,
234                         tmp, &bllen, data->zz);
235     FSYM2(getexp_blocked_lda)(&data->symYZ, data->myzzy, grid->atv,
236                         grid->bas_bl_cnt, grid->basblocks, &grid->shl_bl_cnt,
237                         tmp, &bllen, data->yzzy);
238 }
239 
240 
241 /* ===================================================================
242  *            LDA-optimized code
243  * =================================================================== */
244 /* add_dft_contribution:
245    computes dft contribution to the quadratic response using precomputed
246    traces of the operators (trY, trZ, etc).
247 */
248 static __inline__ real
min(real a,real b)249 min(real a, real b)
250 { return a>b ? b : a; }
251 static void
qrbl_add_lda_contribution(DftIntegratorBl * grid,QuadBlData * d,real * tmp,integer bllen,integer blstart,integer blend)252 qrbl_add_lda_contribution(DftIntegratorBl* grid, QuadBlData* d,
253                           real *tmp, integer bllen, integer blstart, integer blend)
254 {
255     static const real sgn[2]={1.0,-1.0};
256     integer i, j, k, isym, ibl, jbl;
257     real pref2b[DFT_BLLEN], pref2c[DFT_BLLEN], pref3[DFT_BLLEN];
258     real * RESTRICT aos = grid->atv;
259     integer sY = d->spinY, sZ = d->spinZ;
260     real *om = d->res_omega, *omY = d->res_omY, *omZ = d->res_omZ;
261     FunDensProp dp = {0};
262 
263     for(j=blstart; j<blend; j++) {
264         ThirdDrv    drvs; /* the functional derivatives */
265         real weight = grid->weight[grid->curr_point+j];
266         dp.rhoa = dp.rhob = 0.5*grid->r.rho[j];
267         dftpot2_(&drvs, weight, &dp, 0, sY != sZ);
268         /* FIXME: simplify this */
269         /* and first order contribution */
270         pref2b[j] = d->yy[j]*drvs.fRR[sY];
271         pref2c[j] = d->zz[j]*drvs.fRR[sZ];
272 
273         /* third order, and part of second order contribution */
274         pref3[j] =
275             d->yy[j]*d->zz[j]*drvs.fRRR[sY|sZ] +
276             d->yzzy[j]       *drvs.fRR[sY^sZ];
277     }
278 
279     for(isym=0; isym<grid->nsym; isym++) {
280         integer (*RESTRICT iblocks)[2] = BASBLOCK(grid,isym);
281         integer ibl_cnt = grid->bas_bl_cnt[isym];
282 
283         for(ibl=0; ibl<ibl_cnt; ibl++)
284             for(i=iblocks[ibl][0]-1; i<iblocks[ibl][1]; i++) {
285                 integer ioff = i*bllen;
286                 for(k=blstart; k<blend; k++) {
287                     d->vz[k+ioff] = -pref2b[k]*aos[k+ioff];
288                     d->vy[k+ioff] = -pref2c[k]*aos[k+ioff];
289                     tmp[k+ioff]   =  pref3[k]*aos[k+ioff];
290                 }
291             }
292 
293         /* Compute contributions to om, omY and omZ. We could have
294          * special case when all these three matrices have same
295          * symmetry but we do not bother since this code is most
296          * likely not going to be used a lot and it is best to keep it
297          * simple. */
298         for(ibl=0; ibl<ibl_cnt; ibl++) {
299             for(i=iblocks[ibl][0]-1; i<iblocks[ibl][1]; i++) {
300                 integer jsym, jbl_cnt, ioff = i*inforb_.nbast;
301                 real * RESTRICT tmpi = tmp + i*bllen;
302                 real * RESTRICT vyi = d->vy + i*bllen;
303                 real * RESTRICT vzi = d->vz + i*bllen;
304                 integer (*RESTRICT jblocks)[2];
305 
306                 jsym = inforb_.muld2h[d->symYZ-1][isym]-1;
307                 jblocks = BASBLOCK(grid,jsym);
308                 jbl_cnt = grid->bas_bl_cnt[jsym];
309                 for(jbl=0; jbl<jbl_cnt; jbl++) {
310                     integer jtop = min(jblocks[jbl][1],i+1);
311                     for(j=jblocks[jbl][0]-1; j<jblocks[jbl][1]; j++) {
312                         real * RESTRICT aosj = aos + j*bllen;
313                         real s = 0;
314                         for(k=blstart; k<blend; k++) s += aosj[k]*tmpi[k];
315                         om[j+ioff] += s;
316                     }
317                 }
318                 jsym = inforb_.muld2h[d->symZ-1][isym]-1;
319                 jblocks = BASBLOCK(grid,jsym);
320                 jbl_cnt = grid->bas_bl_cnt[jsym];
321                 for(jbl=0; jbl<jbl_cnt; jbl++) {
322                     integer jtop = min(jblocks[jbl][1],i+1);
323                     for(j=jblocks[jbl][0]-1; j<jblocks[jbl][1]; j++) {
324                         real * RESTRICT aosj = aos + j*bllen;
325                         real s = 0;
326                         for(k=blstart; k<blend; k++) s += aosj[k]*vyi[k];
327                         omY[j+ioff] += s;
328                     }
329                 }
330                 jsym = inforb_.muld2h[d->symY-1][isym]-1;
331                 jblocks = BASBLOCK(grid,jsym);
332                 jbl_cnt = grid->bas_bl_cnt[jsym];
333                 for(jbl=0; jbl<jbl_cnt; jbl++) {
334                     integer jtop = min(jblocks[jbl][1],i+1);
335                     for(j=jblocks[jbl][0]-1; j<jblocks[jbl][1]; j++) {
336                         real * RESTRICT aosj = aos + j*bllen;
337                         real s = 0;
338                         for(k=blstart; k<blend; k++) s += aosj[k]*vzi[k];
339                         omZ[j+ioff] += s;
340                     }
341                 }
342                 /* Start next column i... */
343             }
344         }
345     }
346 }
347 static void
qrbl_lda_cb(DftIntegratorBl * grid,real * RESTRICT tmp,integer bllen,integer blstart,integer blend,QuadBlData * data)348 qrbl_lda_cb(DftIntegratorBl* grid, real * RESTRICT tmp,
349 	    integer bllen, integer blstart, integer blend,
350 	    QuadBlData* data)
351 {
352     qrbl_eval_rho_vars       (grid, data, tmp, bllen, blstart, blend);
353     qrbl_add_lda_contribution(grid, data, tmp, bllen, blstart, blend);
354 }
355 
356 /* ===================================================================
357  *            GGA-capable code
358  * =================================================================== */
359 static void
qrbl_eval_gga_vars(DftIntegratorBl * grid,QuadBlData * data,real * tmp,integer bllen,integer blstart,integer blend)360 qrbl_eval_gga_vars(DftIntegratorBl* grid, QuadBlData *data,
361                     real *tmp, integer bllen, integer blstart, integer blend)
362 {
363     /* compute vector of transformed densities and density gradients vt */
364     FSYM2(getexp_blocked_gga)(&data->symY, data->my, grid->atv,
365                         grid->bas_bl_cnt, grid->basblocks, &grid->shl_bl_cnt,
366                         tmp, &bllen, (double(*)[4])data->yy);
367     FSYM2(getexp_blocked_gga)(&data->symZ, data->mz, grid->atv,
368                         grid->bas_bl_cnt, grid->basblocks, &grid->shl_bl_cnt,
369                         tmp, &bllen, (double(*)[4])data->zz);
370     FSYM2(getexp_blocked_gga)(&data->symYZ, data->myzzy, grid->atv,
371                         grid->bas_bl_cnt, grid->basblocks, &grid->shl_bl_cnt,
372                         tmp, &bllen, (double(*)[4])data->yzzy);
373 }
374 
375 static void
qrbl_add_gga_contribution(DftIntegratorBl * grid,QuadBlData * d,real * tmp,integer bllen,integer blstart,integer blend)376 qrbl_add_gga_contribution(DftIntegratorBl* grid, QuadBlData* d,
377                           real *tmp, integer bllen, integer blstart, integer blend)
378 {
379     static const real sgn[2]={1.0,-1.0};
380     integer i, j, k, isym, ibl, jbl;
381     /* pref3 is the prefactor of the double-commuted term, pref2a is
382      * the prefactor of commuted with Z, and pref2b - with commuted
383      * with Y. */
384     real pref2b[DFT_BLLEN][4], pref2c[DFT_BLLEN][4], pref3[DFT_BLLEN][4];
385     real * RESTRICT aos = grid->atv;
386     real * RESTRICT aox = grid->atv+bllen*inforb_.nbast;
387     real * RESTRICT aoy = grid->atv+bllen*inforb_.nbast*2;
388     real * RESTRICT aoz = grid->atv+bllen*inforb_.nbast*3;
389     integer sY = d->spinY, sZ = d->spinZ;
390     real *om = d->res_omega, *omY = d->res_omY, *omZ = d->res_omZ;
391     FunDensProp dp = {0};
392     real (*trY)[4]  = (real (*)[4])d->yy;
393     real (*trZ)[4]  = (real (*)[4])d->zz;
394     real (*yzzy)[4] = (real (*)[4])d->yzzy;
395     real (*grad)[3] = grid->g.grad;
396     integer can_collapse_loops =
397         (d->symY == d ->symZ) && (d->symY == d ->symYZ);
398 
399     for(k=blstart; k<blend; k++) {
400         real trYsum, trZsum, trYZZYsum, trYtimesZ, a, b, c, d;
401         ThirdDrv    drvs; /* the functional derivatives */
402         real weight = grid->weight[grid->curr_point+k];
403         real ngrad = sqrt(grad[k][0]*grad[k][0]+
404                           grad[k][1]*grad[k][1]+
405                           grad[k][2]*grad[k][2]);
406         dp.rhoa = dp.rhob = 0.5*grid->r.rho[k];
407         dp.grada = dp.gradb = 0.5*ngrad;
408         dp.gradab = dp.grada*dp.gradb;
409         dftpot2_(&drvs, weight, &dp, 1, sY != sZ);
410         trYsum = 0.5*(trY[k][1] * grad[k][0] + trY[k][2] * grad[k][1]
411             + trY[k][3] * grad[k][2]);
412         trZsum = 0.5*(trZ[k][1] * grad[k][0] + trZ[k][2] * grad[k][1]
413             + trZ[k][3] * grad[k][2]);
414         trYZZYsum = 0.5*(yzzy[k][1] * grad[k][0] + yzzy[k][2] * grad[k][1]
415                          + yzzy[k][3] * grad[k][2]);
416         trYtimesZ = trY[k][1]*trZ[k][1] + trY[k][2]*trZ[k][2]
417             + trY[k][3]*trZ[k][3];
418         /* FIXME: simplify this */
419         /* and first order contribution */
420 
421         /* third order, and part of second order contribution */
422         pref3[k][0] = trY[k][0]*trZ[k][0]*drvs.fRRR[sY|sZ]+
423             yzzy[k][0]*drvs.fRR[sY^sZ] +
424             2*(drvs.fRRZ[sZ][sY]*trY[k][0]*trZsum +
425                drvs.fRRZ[sY][sZ]*trZ[k][0]*trYsum) +
426                4*trZsum*trYsum*drvs.fRZZ[sY^sZ][sY] +
427                2*(trYtimesZ + trYZZYsum)*drvs.fRZ[sY^sZ];
428 
429         pref3[k][0] += drvs.fRRG[sY]*trY[k][0]*trZsum*(1+sgn[sZ])+
430             drvs.fRRG[sZ]*trZ[k][0]*trYsum*(1+sgn[sY])+
431             0.5*drvs.fRG[0]*(1+sgn[sY]*sgn[sZ])*trYZZYsum +
432             0.5*drvs.fRG[0]*(sgn[sY]+sgn[sZ])*trYtimesZ;
433 
434         pref2b[k][0] = trY[k][0]*drvs.fRR[sY] + 2*trYsum*drvs.fRZ[sY]
435             + trYsum*drvs.fRG[sY];
436 
437         pref2c[k][0] = trZ[k][0]*drvs.fRR[sZ] + 2*trZsum*drvs.fRZ[sZ]
438             + trZsum*drvs.fRG[sZ];
439 
440 
441         a = trY[k][0]*(2*drvs.fRZ[sY]+drvs.fRG[sY])+4*trYsum*drvs.fZZ[sY];
442         b = 2*drvs.fZ + sgn[sY]*drvs.fG;
443         b *= 2;
444 
445 	pref2b[k][1] = a*grad[k][0] + b*trY[k][1];
446 	pref2b[k][2] = a*grad[k][1] + b*trY[k][2];
447 	pref2b[k][3] = a*grad[k][2] + b*trY[k][3];
448 
449 
450         a = trZ[k][0]*(2*drvs.fRZ[sZ]+drvs.fRG[sZ])+4*trZsum*drvs.fZZ[sZ];
451         b = 2*drvs.fZ + sgn[sZ]*drvs.fG;
452         b *= 2;
453 
454 	pref2c[k][1] = a*grad[k][0] + b*trZ[k][1];
455 	pref2c[k][2] = a*grad[k][1] + b*trZ[k][2];
456 	pref2c[k][3] = a*grad[k][2] + b*trZ[k][3];
457 
458         /* xyz components of pref3 */
459 	a = (8*drvs.fZZZ[sY|sZ]*trYsum*trZsum +
460              4*(drvs.fRZZ[sY][sZ]*trY[k][0]*trZsum +
461                 drvs.fRZZ[sZ][sY]*trZ[k][0]*trYsum)+
462              2*drvs.fRRZ[sY^sZ][sY]*trY[k][0]*trZ[k][0]);
463         a += drvs.fRRGX[sY][sZ]*trY[k][0]*trZ[k][0]
464             + 4*drvs.fZZ[sY^sZ]*trYZZYsum
465             + 4*drvs.fZZ[sY^sZ]*trYtimesZ
466             + 2*drvs.fRZ[sY^sZ]*yzzy[k][0];
467         a += drvs.fRG[sY^sZ]*yzzy[k][0];
468 
469         b = 4*drvs.fZZ[sY]*trYsum + 2*drvs.fRZ[sY]*trY[k][0]
470             + drvs.fRG[sY]*trY[k][0]*sgn[sZ];
471         c = 4*drvs.fZZ[sZ]*trZsum + 2*drvs.fRZ[sZ]*trZ[k][0]
472             + drvs.fRG[sZ]*trZ[k][0]*sgn[sY];
473         d = 2*drvs.fZ + sgn[sY]*sgn[sZ]*drvs.fG;
474         b *= 2; c *= 2; d *= 2;
475 
476         pref3[k][1] = a*grad[k][0] + b*trZ[k][1] + c*trY[k][1] + d*yzzy[k][1];
477         pref3[k][2] = a*grad[k][1] + b*trZ[k][2] + c*trY[k][2] + d*yzzy[k][2];
478         pref3[k][3] = a*grad[k][2] + b*trZ[k][3] + c*trY[k][3] + d*yzzy[k][3];
479     }
480 
481     for(isym=0; isym<grid->nsym; isym++) {
482         integer (*RESTRICT iblocks)[2] = BASBLOCK(grid,isym);
483         integer ibl_cnt = grid->bas_bl_cnt[isym];
484         for(ibl=0; ibl<ibl_cnt; ibl++)
485             for(i=iblocks[ibl][0]-1; i<iblocks[ibl][1]; i++) {
486                 real * RESTRICT a0 = aos + i*bllen;
487                 real * RESTRICT ax = aox + i*bllen;
488                 real * RESTRICT ay = aoy + i*bllen;
489                 real * RESTRICT az = aoz + i*bllen;
490                 real * RESTRICT vyi = d->vy + i*bllen;
491                 real * RESTRICT vzi = d->vz + i*bllen;
492                 real * RESTRICT tmi = tmp + i*bllen;
493                 for(k=blstart; k<blend; k++) {
494                     vzi[k] = -(pref2b[k][0]*a0[k] + pref2b[k][1]*ax[k] +
495                                pref2b[k][2]*ay[k] + pref2b[k][3]*az[k]);
496                     vyi[k] = -(pref2c[k][0]*a0[k] + pref2c[k][1]*ax[k] +
497                                pref2c[k][2]*ay[k] + pref2c[k][3]*az[k]);
498                     tmi[k] =  (pref3 [k][0]*a0[k] + pref3 [k][1]*ax[k] +
499                                pref3 [k][2]*ay[k] + pref3 [k][3]*az[k]);
500                 }
501             }
502         /* Time to distribute computed prefactors. In principle, we
503         should have three separate loops because three generated
504         matrices may have different symmetries. On the other hand, it
505         is profitable to detect the commonly occuring case of same
506         symmetry and collapse the loops to reduce the overhead.
507         1:special case when all three vectors have same symmetry */
508         if(can_collapse_loops) {
509             integer nbast = inforb_.nbast;
510             integer jsym = inforb_.muld2h[d->symYZ-1][isym]-1;
511             integer jbl_cnt = grid->bas_bl_cnt[jsym];
512             for(ibl=0; ibl<ibl_cnt; ibl++) {
513                 for(jbl=0; jbl<jbl_cnt; jbl++) {
514                     integer lenbl = blend - blstart;
515                     integer (*RESTRICT jblocks)[2] = BASBLOCK(grid,jsym);
516                     integer ilen = iblocks[ibl][1] - iblocks[ibl][0] + 1;
517                     integer jlen = jblocks[jbl][1] - jblocks[jbl][0] + 1;
518 
519                     dgemm_("T","N", &jlen, &ilen, &lenbl, &ONER,
520                     aos + (jblocks[jbl][0] - 1)*bllen+blstart, &bllen,
521                     tmp + (iblocks[ibl][0] - 1)*bllen+blstart, &bllen,
522                     &ONER, om + (iblocks[ibl][0]-1)*nbast+jblocks[jbl][0]-1, &nbast);
523 
524                     dgemm_("T","N", &jlen, &ilen, &lenbl, &ONER,
525                     aos +   (jblocks[jbl][0] - 1)*bllen+blstart, &bllen,
526                     d->vy + (iblocks[ibl][0] - 1)*bllen+blstart, &bllen,
527                     &ONER, omY + (iblocks[ibl][0]-1)*nbast+jblocks[jbl][0]-1, &nbast);
528 
529                     dgemm_("T","N", &jlen, &ilen, &lenbl, &ONER,
530                     aos +   (jblocks[jbl][0] - 1)*bllen+blstart, &bllen,
531                     d->vz + (iblocks[ibl][0] - 1)*bllen+blstart, &bllen,
532                     &ONER, omZ + (iblocks[ibl][0]-1)*nbast+jblocks[jbl][0]-1, &nbast);
533                 }
534             }
535             continue; /* no need to do the general case here */
536         }
537         /* 2:general case */
538         for(ibl=0; ibl<ibl_cnt; ibl++) {
539             for(i=iblocks[ibl][0]-1; i<iblocks[ibl][1]; i++) {
540                 integer ioff = i*inforb_.nbast;
541                 real sum;
542                 real * RESTRICT tmpi = tmp  + i*bllen;
543                 real * RESTRICT vyi = d->vy + i*bllen;
544                 real * RESTRICT vzi = d->vz + i*bllen;
545                 integer jsym = inforb_.muld2h[d->symYZ-1][isym]-1;
546                 integer (*RESTRICT jblocks)[2] = BASBLOCK(grid,jsym);
547                 integer jbl_cnt = grid->bas_bl_cnt[jsym];
548                 for(jbl=0; jbl<jbl_cnt; jbl++) {
549                     for(j=jblocks[jbl][0]-1; j<jblocks[jbl][1]; j++) {
550                         real * RESTRICT aosj = aos + j*bllen;
551                         sum = 0;
552                         for(k=blstart; k<blend; k++) sum += aosj[k]*tmpi[k];
553                         om [j+ioff] += sum;
554                     }
555                 }
556                 jsym = inforb_.muld2h[d->symZ-1][isym]-1;
557                 jblocks = BASBLOCK(grid,jsym);
558                 jbl_cnt = grid->bas_bl_cnt[jsym];
559                 for(jbl=0; jbl<jbl_cnt; jbl++) {
560                     for(j=jblocks[jbl][0]-1; j<jblocks[jbl][1]; j++) {
561                         real * RESTRICT aosj = aos + j*bllen;
562                         sum = 0;
563                         for(k=blstart; k<blend; k++) sum += aosj[k]*vyi[k];
564                         omY[j+ioff] += sum;
565                     }
566                 }
567                 jsym = inforb_.muld2h[d->symY-1][isym]-1;
568                 jblocks = BASBLOCK(grid,jsym);
569                 jbl_cnt = grid->bas_bl_cnt[jsym];
570                 for(jbl=0; jbl<jbl_cnt; jbl++) {
571                     for(j=jblocks[jbl][0]-1; j<jblocks[jbl][1]; j++) {
572                         real * RESTRICT aosj = aos + j*bllen;
573                         sum = 0;
574                         for(k=blstart; k<blend; k++) sum += aosj[k]*vzi[k];
575                         omZ[j+ioff] += sum;
576                     }
577                 }
578             }
579         }
580     }
581 }
582 
583 static void
qrbl_gga_cb(DftIntegratorBl * grid,real * RESTRICT tmp,integer bllen,integer blstart,integer blend,QuadBlData * data)584 qrbl_gga_cb(DftIntegratorBl* grid, real * RESTRICT tmp,
585 	    integer bllen, integer blstart, integer blend,
586 	    QuadBlData* data)
587 {
588     qrbl_eval_gga_vars       (grid, data, tmp, bllen, blstart, blend);
589     qrbl_add_gga_contribution(grid, data, tmp, bllen, blstart, blend);
590 }
591 
592 /* ===================================================================
593  *                   Parallel section
594  * =================================================================== */
595 
596 #if defined(VAR_MPI)
597 #include <mpi.h>
598 #include <our_extra_mpi.h>
599 #define MASTER_NO 0
600 /* dft_qr_faster_slave:
601    this is a slave driver. It's task is to allocate memory needed by
602    the main property evaluator (dftqrcf_ in this case) and call it.
603 */
604 void
dft_qrbl_slave(real * work,integer * lwork,integer * iprint)605 dft_qrbl_slave(real* work, integer* lwork, integer* iprint)
606 {
607     real* fi    = malloc(inforb_.n2basx*sizeof(real));              /* OUT */
608     real *cmo   = malloc(inforb_.norbt*inforb_.nbast*sizeof(real)); /* IN  */
609     real *kappaY= malloc(inforb_.n2orbx*sizeof(real));              /* IN  */
610     real *kappaZ= malloc(inforb_.n2orbx*sizeof(real));              /* IN  */
611     integer addfock, symY, symZ, spinY, spinZ;                      /* IN  */
612     FSYM2(dft_qr_respons)(fi, cmo, kappaY, &symY, &spinY, kappaZ, &symZ, &spinZ,
613 			  &addfock, work, lwork, iprint);
614     free(kappaZ);
615     free(kappaY);
616     free(cmo);
617     free(fi);
618 }
619 
620 static void
qrbl_sync_slaves(real * cmo,real * kappaY,real * kappaZ,integer * addfock,integer * symY,integer * symZ,integer * spinY,integer * spinZ)621 qrbl_sync_slaves(real* cmo, real* kappaY, real* kappaZ, integer* addfock,
622 		 integer* symY, integer* symZ, integer* spinY, integer* spinZ)
623 {
624     static const SyncData sync_data[] = {
625  	{ &inforb_.nocc,  8, fortran_MPI_INT },
626  	{ &inforb_.nocct, 1, fortran_MPI_INT },
627  	{ &inforb_.nvirt, 1, fortran_MPI_INT },
628     };
629  #ifdef C99_COMPILER
630      const SyncData data2[] = {
631  	{ cmo,     &inforb_.norbt*inforb_.nbast,MPI_DOUBLE },
632  	{ kappaY,  &inforb_.n2orbx,             MPI_DOUBLE },
633  	{ kappaZ,  &inforb_.n2orbx,             MPI_DOUBLE },
634  	{ addfock, 1,                          fortran_MPI_INT    },
635  	{ symY,    1,                          fortran_MPI_INT    },
636  	{ symZ,    1,                          fortran_MPI_INT    },
637  	{ spinY,   1,                          fortran_MPI_INT    },
638  	{ spinZ,   1,                          fortran_MPI_INT    }
639      };
640  #else /* C99_COMPILER */
641     /* this is more error-prone but some compilers (HP/UX)... */
642     static SyncData data2[] =
643     { {NULL, 0, MPI_DOUBLE}, {NULL, 0, MPI_DOUBLE}, {NULL, 0, MPI_DOUBLE},
644       {NULL, 0, fortran_MPI_INT   }, {NULL, 0, fortran_MPI_INT   }, {NULL, 0, fortran_MPI_INT   },
645       {NULL, 0, fortran_MPI_INT   }, {NULL, 0, fortran_MPI_INT   } };
646     data2[0].data = cmo;     data2[0].count = inforb_.norbt*inforb_.nbast;
647     data2[1].data = kappaY;  data2[1].count = inforb_.n2orbx;
648     data2[2].data = kappaZ;  data2[2].count = inforb_.n2orbx;
649     data2[3].data = addfock; data2[3].count = 1;
650     data2[4].data = symY;    data2[4].count = 1;
651     data2[5].data = symZ;    data2[5].count = 1;
652     data2[6].data = spinY;   data2[6].count = 1;
653     data2[7].data = spinZ;   data2[7].count = 1;
654  #endif /* C99_COMPILER */
655 
656     mpi_sync_data(sync_data, ELEMENTS(sync_data));
657     mpi_sync_data(data2,     ELEMENTS(data2));
658 }
659 static __inline__ void
qrbl_collect_info(real * fi,real * work,integer lwork)660 qrbl_collect_info(real* fi, real*work, integer lwork)
661 {
662     int sz = 0;
663     MPI_Comm_size(MPI_COMM_WORLD, &sz);
664     if(sz<=1) return;
665 
666     CHECK_WRKMEM(inforb_.n2orbx,lwork);
667     dcopy_(&inforb_.n2orbx, fi, &ONEI, work, &ONEI);
668     integer n2orbx_int = inforb_.n2orbx;
669     MPI_Reduce(work, fi, n2orbx_int, MPI_DOUBLE, MPI_SUM,
670 	       MASTER_NO, MPI_COMM_WORLD);
671 }
672 #else /* VAR_MPI */
673 #define qrbl_sync_slaves(cmo,kappaY,kappaZ,addfck,symY,symZ,spinY,spinZ)
674 #define qrbl_collect_info(fi,work,lwork)
675 #endif /* VAR_MPI */
676 
677 /* ===================================================================
678  *                   Main driver
679  * =================================================================== */
680 void
FSYM2(dft_qr_respons)681 FSYM2(dft_qr_respons)(real *fi, real *cmo,
682                       real *kappaY, integer *symY, integer *spinY,
683                       real *kappaZ, integer *symZ, integer *spinZ,
684                       integer *addfock, real *work, integer *lwork, integer *iprint)
685 {
686     static integer msg_printed = 0;
687     struct tms starttm, endtm; clock_t utm;
688     real electrons;
689     QuadBlData qr_data; /* quadratic response data */
690     integer i, j;
691     real *dmat;
692     real *tmp1, *tmp2;
693     DftBlockCallback cb;
694 
695     /* WARNING: NO work MAY BE done before syncing slaves! */
696     dft_wake_slaves((DFTPropEvalMaster)FSYM2(dft_qr_respons)); /* NO-OP in serial */
697     qrbl_sync_slaves(cmo,kappaY,kappaZ,addfock,
698 		         symY,symZ, spinY,spinZ);
699     if(*addfock) {/* call the old, slow version here */
700 #ifdef VAR_MPI
701 	fort_print("This was supposed not to be called.");
702 	dalton_quit("This calculation not reported to work in parallel.");
703 #endif
704         dftqrcf_(fi, cmo, kappaY, symY, spinY,
705                  kappaZ, symZ, spinZ, addfock,
706                  work, lwork, iprint);
707         return;
708     }
709     dmat = dal_malloc(inforb_.n2basx*sizeof(real));
710     if(!msg_printed) {
711         fort_print("DFT-QR computed in a linearly-scaling fashion.\n");
712         msg_printed = 1;
713     }
714     times(&starttm);
715     FSYM2(dft_get_ao_dens_mat)(cmo, dmat, work, lwork);
716     qrbl_data_init(&qr_data, cmo, selected_func->is_gga(), DFT_BLLEN,
717 		   kappaY, *symY, *spinY,
718 		   kappaZ, *symZ, *spinZ,
719 		   dmat, work, lwork);
720     cb = (DftBlockCallback)
721         (qr_data.is_gga ? qrbl_gga_cb : qrbl_lda_cb );
722     electrons = dft_integrate_ao_bl(1, dmat, work, lwork, iprint, 0,
723                                     cb, &qr_data);
724     free(dmat);
725     if(DFTQR_DEBUG) {
726         fort_print("DFT quadratic contribution (AO)");
727         outmat_(qr_data.res_omega, &ONEI, &inforb_.nbast, &ONEI,
728                 &inforb_.nbast, &inforb_.nbast, &inforb_.nbast);
729         outmat_(qr_data.res_omY, &ONEI, &inforb_.nbast, &ONEI,
730                 &inforb_.nbast, &inforb_.nbast, &inforb_.nbast);
731         outmat_(qr_data.res_omZ, &ONEI, &inforb_.nbast, &ONEI,
732                 &inforb_.nbast, &inforb_.nbast, &inforb_.nbast);
733     }
734     /* LDA callback computes only half and GGA needs to be symmetrized */
735     for(i=0; i<inforb_.nbast; i++) {
736         for(j=0; j<i; j++) {
737             integer ji = j + i*inforb_.nbast;
738             integer ij = i + j*inforb_.nbast;
739             real avg = 0.5*(qr_data.res_omega[ij]+qr_data.res_omega[ji]);
740             qr_data.res_omega[ij] = qr_data.res_omega[ji] = avg;
741             avg = 0.5*(qr_data.res_omY[ij] + qr_data.res_omY[ji]);
742             qr_data.res_omY[ij] = qr_data.res_omY[ji] = avg;
743             avg = 0.5*(qr_data.res_omZ[ij] + qr_data.res_omZ[ji]);
744             qr_data.res_omZ[ij] = qr_data.res_omZ[ji] = avg;
745         }
746     }
747     /* Transform to MO Fock matrix contribution: fix the symmetry of
748      * the result!  */
749     tmp1 = dal_malloc(inforb_.n2orbx*sizeof(real));
750     tmp2 = dal_malloc(inforb_.n2orbx*sizeof(real));
751     dzero_(tmp1, &inforb_.n2orbx);
752     FSYM(lrao2mo)(cmo, &qr_data.symYZ, qr_data.res_omega, tmp1, work, lwork);
753     /* compute commutators and add them */
754     dzero_(tmp2, &inforb_.n2orbx);
755     FSYM(lrao2mo)(cmo, &qr_data.symZ, qr_data.res_omY, tmp2, work, lwork);
756     commute_matrices(inforb_.norbt, tmp2, kappaY, tmp1, 1);
757     dzero_(tmp2, &inforb_.n2orbx);
758     FSYM(lrao2mo)(cmo, &qr_data.symY, qr_data.res_omZ, tmp2, work, lwork);
759     commute_matrices(inforb_.norbt, tmp2, kappaZ, tmp1, 1);
760     qrbl_collect_info(tmp1, work, *lwork);  /* NO-OP in serial */
761     daxpy_(&inforb_.n2orbx, &HALFR, tmp1, &ONEI, fi, &ONEI);
762     if(DFTQR_DEBUG) {
763         fort_print("DFT quadratic contribution (MO)");
764         outmat_(fi, &ONEI, &inforb_.norbt, &ONEI, &inforb_.norbt,
765                 &inforb_.norbt, &inforb_.norbt);
766     }
767     free(tmp1); free(tmp2);
768     qrbl_data_free(&qr_data);
769     if (*iprint>0) {
770       times(&endtm);
771       utm = endtm.tms_utime-starttm.tms_utime;
772       fort_print("      Electrons: %f(%9.3g): QR-DFT/b evaluation time: %9.1f s",
773                  electrons, (double)(electrons-(integer)(electrons+0.5)),
774                  utm/(double)sysconf(_SC_CLK_TCK));
775     }
776 }
777