1 /*
2 
3 
4 !
5 !  Dalton, a molecular electronic structure program
6 !  Copyright (C) by the authors of Dalton.
7 !
8 !  This program is free software; you can redistribute it and/or
9 !  modify it under the terms of the GNU Lesser General Public
10 !  License version 2.1 as published by the Free Software Foundation.
11 !
12 !  This program is distributed in the hope that it will be useful,
13 !  but WITHOUT ANY WARRANTY; without even the implied warranty of
14 !  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 !  Lesser General Public License for more details.
16 !
17 !  If a copy of the GNU LGPL v2.1 was not distributed with this
18 !  code, you can obtain one at https://www.gnu.org/licenses/old-licenses/lgpl-2.1.en.html.
19 !
20 
21 !
22 
23 */
24 /*-*-mode: C; c-indentation-style: "bsd"; c-basic-offset: 4; -*-*/
25 /* The DFT Propery evaluators.
26    (c) Pawel Salek, pawsa@theochem.kth.se.
27    2002.04.05
28 
29    This file is written in C because it is easier to write portable
30    programs in C than in Fortran due to more strict syntax checking and
31    existing language standard that is actually obeyed by compiler writers
32    (not to mention language features).
33 
34    This module evaluates DFT contribution to different properties,
35    in particular:
36    a). DFT contribution to the Fock/KS matrix (dft_kohn_sham)
37    b). DFT contribution to the linear response (dft_lin_resp)
38    c). DFT contribution to the molecular gradient (dft_mol_grad)
39 */
40 /* strictly conform to XOPEN ANSI C standard */
41 #define _XOPEN_SOURCE          500
42 #define _XOPEN_SOURCE_EXTENDED 1
43 
44 #include <assert.h>
45 #include <math.h>
46 #include <stdio.h>
47 #include <stdlib.h>
48 #include <time.h>
49 #include <sys/times.h>
50 #include <unistd.h>
51 
52 #define __CVERSION__
53 #include "general.h"
54 #include "integrator.h"
55 #include "functionals.h"
56 
57 #include "inforb.h"
58 
59 const static integer KOHNSH_DEBUG = 0;
60 const static integer DFTLR_DEBUG  = 0;
61 const static integer DFTMAG_DEBUG = 0;
62 
63 #if defined(VAR_MPI)
64 #include <mpi.h>
65 #include <our_extra_mpi.h>
66 #define MASTER_NO 0
67 #endif
68 #if 0 && defined(VAR_MPI)
69 
70 /* dft_kohn_sham_slave:
71    this is a slave driver. It's task is to allocate memory needed by
72    the main property evaluator (dft_kohn_sham in this case) and call it.
73 */
74 void
75 dft_kohn_sham_slave(real* work, integer* lwork, integer* iprint)
76 {
77     real* dmat = malloc(inforb_.n2basx*sizeof(real));
78     real* ksm  = calloc(inforb_.n2basx,sizeof(real));
79     integer iprint = 0;
80     dft_kohn_sham_(dmat, ksm, work, lwork, &iprint);
81     free(dmat);
82     free(ksm);
83 }
84 
85 static __inline__ void
86 dft_kohn_sham_sync_slaves(real* dmat)
87 {
88     MPI_Bcast(dmat, inforb_.n2basx,MPI_DOUBLE,
89 	      MASTER_NO, MPI_COMM_WORLD);
90 }
91 
92 static __inline__ void
93 dft_kohn_sham_collect_info(real*ksm, real* energy, real* work, integer lwork)
94 {
95     real tmp = *energy;
96     integer sz = 0;
97     MPI_Comm_size(MPI_COMM_WORLD, &sz);
98     if(sz <=1) return;
99     CHECK_WRKMEM(inforb_.n2basx, lwork);
100     dcopy_(&inforb_.n2basx, ksm,&ONEI, work, &ONEI);
101     MPI_Reduce(work, ksm, inforb_.n2basx, MPI_DOUBLE, MPI_SUM,
102 	       MASTER_NO, MPI_COMM_WORLD);
103     MPI_Reduce(&tmp, energy, 1, MPI_DOUBLE, MPI_SUM,
104 	       MASTER_NO, MPI_COMM_WORLD);
105 }
106 
107 #else /* VAR_MPI */
108 #define dft_kohn_sham_sync_slaves(dmat)
109 #define dft_kohn_sham_collect_info(myksm, ksm, energy,lwork)
110 #endif /* VAR_MPI */
111 
112 static real energy = 0.0;
113 
114 /* ------------------------------------------------------------------- */
115 /* ---------- DFT LINEAR RESPONSE CONTRIBUTION EVALUATOR ------------- */
116 /* ------------------------------------------------------------------- */
117 void FSYM(deq27)(const real* cmo, const real* ubo, const real* dv,
118                  real* dxcao, real* dxvao, real* wrk, integer* lfrsav);
119 void FSYM(autpv)(const integer*isym,const integer*jsym,
120 		 const real*u,const real* v,
121                  real*prpao, const integer*nbas,const integer*nbast, real*prpmo,
122                  const integer*norb, const integer*norbt,
123 		 real* wrk,integer* lwrk);
124 
125 typedef struct {
126     real* dmat, *kappa, *res;
127     real* dtgao;
128     integer   trplet, ksymop;
129 } LinRespData;
130 
131 
132 static __inline__ real
min(real a,real b)133 min(real a, real b)
134 { return a>b ? b : a; }
135 static __inline__ real
max(real a,real b)136 max(real a, real b)
137 { return a>b ? a : b; }
138 
139 /* dft_lin_resp_slave:
140    this is a slave driver. It's task is to allocate memory needed by
141    the main property evaluator (dft_lin_resp in this case) and call
142    it.  The computed values are collected in the evaluater using
143    dft_lin_resp_collect info and we can just drop the arrays on the
144    floor when we are done.
145    FIXME: we can use work for the temp data....
146 */
147 #if defined(VAR_MPI)
148 
149 extern void FSYM(dftlrsync)(void);
150 static void
dft_lin_resp_sync_slaves(real * cmo,integer * nvec,real ** zymat,integer * trplet,integer * ksymop,real ** work,integer lwork)151 dft_lin_resp_sync_slaves(real* cmo, integer *nvec, real **zymat,
152                          integer* trplet, integer* ksymop,
153                          real **work, integer lwork)
154 {
155     static SyncData data2[] =
156     { {NULL, 0, MPI_DOUBLE}, {NULL, 0, MPI_DOUBLE}, {NULL, 0, fortran_MPI_INT},
157       {NULL, 0, fortran_MPI_INT} };
158     MPI_Bcast(nvec,  1, fortran_MPI_INT, MASTER_NO, MPI_COMM_WORLD);
159     FSYM(dftlrsync)();
160     if(*zymat == NULL) { /* we are a slave */
161         if(lwork<*nvec*inforb_.n2orbx)
162             dalton_quit("%s:  slave needs %d words for ZYMAT, available %d",
163                         __FUNCTION__, *nvec*inforb_.n2orbx, lwork);
164 	*zymat = *work;
165         *work += *nvec*inforb_.n2orbx;
166     }
167     data2[0].data = cmo;    data2[0].count = inforb_.ncmot;
168     data2[1].data = *zymat; data2[1].count = *nvec*inforb_.n2orbx;
169     data2[2].data = trplet; data2[2].count = 1;
170     data2[3].data = ksymop; data2[3].count = 1;
171     mpi_sync_data(data2, ELEMENTS(data2));
172 }
173 
174 static __inline__ void
dft_lin_resp_collect_info(integer nvec,real * fmat,real * work,integer lwork)175 dft_lin_resp_collect_info(integer nvec, real* fmat, real*work, integer lwork)
176 {
177     int sz_mpi = 0;
178     MPI_Comm_size(MPI_COMM_WORLD, &sz_mpi);
179     if(sz_mpi<=1) return;
180 
181     integer sz = nvec*inforb_.n2basx;
182     sz_mpi = sz;
183     if(sz>lwork) {
184         CHECK_WRKMEM(inforb_.n2basx,lwork);
185         for(sz=0; sz<nvec; sz++) {
186             dcopy_(&inforb_.n2basx, fmat + sz*inforb_.n2basx,
187                    &ONEI, work, &ONEI);
188             integer len_for_reduce = inforb_.n2basx;
189             MPI_Reduce(work, fmat+sz*inforb_.n2basx, len_for_reduce,
190                        MPI_DOUBLE, MPI_SUM, MASTER_NO, MPI_COMM_WORLD);
191         }
192     } else {
193         CHECK_WRKMEM(sz, lwork);
194         dcopy_(&sz, fmat, &ONEI, work, &ONEI);
195         MPI_Reduce(work, fmat, sz_mpi, MPI_DOUBLE, MPI_SUM,
196                    MASTER_NO, MPI_COMM_WORLD);
197     }
198 }
199 
200 #else  /* VAR_MPI */
201 #define dft_lin_resp_sync_slaves(cmo,nvec,zymat,trplet,ksymop,work,lwork)
202 #define dft_lin_resp_collect_info(nvec,fmat,work,lwork)
203 #endif /* VAR_MPI */
204 
205 static void
lin_resp_cb(DftGrid * grid,LinRespData * data)206 lin_resp_cb(DftGrid* grid, LinRespData* data)
207 {
208     real b0, b3[3];
209     integer isym, i, j;
210     SecondDrv vxc;
211 
212     dgemv_("N",&inforb_.nbast,&inforb_.nbast,&ONER,
213 	   data->kappa,&inforb_.nbast,grid->atv,&ONEI,&ZEROR,
214 	   data->dtgao,&ONEI);
215     b0 = ddot_(&inforb_.nbast,data->dtgao,&ONEI,grid->atv,&ONEI);
216     if(grid->dogga) {
217 	real brg, brz, facr, facg, bmax, vt[4];
218 	real *atvX = &grid->atv[inforb_.nbast  ];
219 	real *atvY = &grid->atv[inforb_.nbast*2];
220 	real *atvZ = &grid->atv[inforb_.nbast*3];
221         real ngrad = (grid->dp.grada + grid->dp.gradb);
222 	/* B3 = GAO1'*DTGAO; */
223 	dgemv_("T",&inforb_.nbast,&THREEI,&ONER,atvX, &inforb_.nbast,
224 	       data->dtgao,&ONEI, &ZEROR, b3, &ONEI);
225 	/* DTGAO= DTRMAT'*GAO */
226 	dgemv_("T",&inforb_.nbast, &inforb_.nbast, &ONER, data->kappa,
227 	       &inforb_.nbast,grid->atv,&ONEI,&ZEROR,data->dtgao,&ONEI);
228 	/*  B3 = B3 + GAO1'*DTGAO */
229 	dgemv_("T",&inforb_.nbast,&THREEI,&ONER,atvX,
230 	       &inforb_.nbast,data->dtgao,&ONEI,&ONER,b3,&ONEI);
231 	bmax = max(fabs(b0),max(fabs(b3[0]),max(fabs(b3[1]),fabs(b3[2]))));
232 	if(bmax<=grid->dfthri) return;
233 	brg = (b3[0]*grid->grada[0] +
234                b3[1]*grid->grada[1] +
235                b3[2]*grid->grada[2])*2;
236         brz = brg/ngrad;
237 
238 	dftpot1_(&vxc, &grid->curr_weight, &grid->dp, &data->trplet);
239         facr = vxc.fRZ*b0 + (vxc.fZZ-vxc.fZ/ngrad)*brz + vxc.fZG*brg;
240         facr = 2*(facr/ngrad + (vxc.fRG*b0+vxc.fZG*brz +vxc.fGG*brg));
241         facg = vxc.fZ/ngrad + vxc.fG;
242         vt[0] = vxc.fRR*b0 + vxc.fRZ*brz+ vxc.fRG*brg;
243         vt[1] = facr*grid->grada[0] + facg*b3[0];
244         vt[2] = facr*grid->grada[1] + facg*b3[1];
245         vt[3] = facr*grid->grada[2] + facg*b3[2];
246 	for(isym=0; isym<inforb_.nsym; isym++) {
247 	    integer istr = inforb_.ibas[isym];
248 	    integer iend = inforb_.ibas[isym] + inforb_.nbas[isym];
249 	    integer jsym = inforb_.muld2h[data->ksymop-1][isym]-1;
250 	    if (isym>=jsym) {
251 		integer jstr = inforb_.ibas[jsym];
252 		integer jend = inforb_.ibas[jsym] + inforb_.nbas[jsym];
253 		for(i=istr; i<iend; i++) {
254 		    real g0 = grid->atv[i];
255 		    real gx = atvX[i];
256 		    real gy = atvY[i];
257 		    real gz = atvZ[i];
258 		    integer ioff = i*inforb_.nbast;
259                     integer jtop = min(jend,i+1);
260 		    for(j=jstr; j<jtop; j++) {
261 			real a0 = g0*grid->atv[j];
262 			real ax = gx*grid->atv[j] + g0*atvX[j];
263 			real ay = gy*grid->atv[j] + g0*atvY[j];
264 			real az = gz*grid->atv[j] + g0*atvZ[j];
265 			data->res[j+ioff] +=
266                             vt[0]*a0 + vt[1]*ax +
267                             vt[2]*ay + vt[3]*az;
268 		    }
269 		}
270 	    }
271 	}
272     } else { /* dogga == FALSE */
273 	real vt;
274 	if(fabs(b0)<=grid->dfthri) return;
275 	dftpot1_(&vxc, &grid->curr_weight, &grid->dp, &data->trplet);
276 	vt = vxc.fRR*b0;
277 	for(isym=0; isym<inforb_.nsym; isym++) {
278 	    integer jsym = inforb_.muld2h[data->ksymop-1][isym]-1;
279 	    if(isym>=jsym) {
280 		integer istr = inforb_.ibas[isym];
281 		integer iend = inforb_.ibas[isym] + inforb_.nbas[isym];
282 		integer jstr = inforb_.ibas[jsym];
283 		integer jend = inforb_.ibas[jsym] + inforb_.nbas[jsym]-1;
284 		for(i=istr; i<iend; i++) {
285 		    integer ioff = i*inforb_.nbast;
286 		    real gvi = vt*grid->atv[i];
287 		    for(j=min(i,jend); j>=jstr; j--)
288 			data->res[j+ioff] += gvi*grid->atv[j];
289 		}
290 	    }
291 	}
292     }
293 }
294 
295 /* dft_lin_resp_:
296    Main Linear Response evaluator.
297    FMAT(NORBT,NORBT) - result added to FMAT.
298    cmo -const
299    zymat(NORBT,NORBT) - const response vector.
300    trplet - triplet excitation? (bool)
301 */
302 void
FSYM2(dft_lin_resp)303 FSYM2(dft_lin_resp)(real* fmat, real *cmo, real *zymat, integer *trplet,
304 		    integer *ksymop, real* work, integer* lwork, integer* iprint)
305 {
306     static const real DP5R = 0.5;
307     struct tms starttm, endtm; clock_t utm;
308     real electrons;
309     LinRespData lr_data; /* linear response data */
310     DftCallbackData cbdata[1];
311     DftDensity dens = { dft_dens_restricted, NULL, NULL };
312     real dummy;
313     integer nvec1=1;
314     integer i, j;
315 
316     /* WARNING: NO work MAY BE done before syncing slaves! */
317     dft_wake_slaves((DFTPropEvalMaster)dft_lin_resp_);    /* NO-OP in serial */
318     dft_lin_resp_sync_slaves(cmo,&nvec1,&zymat,trplet,ksymop,
319                              &work, *lwork);              /* NO-OP in serial */
320 
321     times(&starttm);
322     lr_data.dmat  = malloc(inforb_.n2basx*sizeof(real));
323     lr_data.res   = calloc(inforb_.n2basx,sizeof(real));
324     lr_data.kappa = calloc(inforb_.n2basx,sizeof(real));
325     lr_data.dtgao = malloc(inforb_.nbast *sizeof(real));
326     lr_data.trplet= *trplet;
327     lr_data.ksymop = *ksymop;
328     dens.dmata = lr_data.dmat;
329 
330     FSYM2(dft_get_ao_dens_mat)(cmo, lr_data.dmat, work, lwork);
331     FSYM(deq27)(cmo,zymat,&dummy,lr_data.kappa,&dummy,work,lwork);
332     dscal_(&inforb_.n2basx,&DP5R,lr_data.kappa,&ONEI);
333     if(DFTLR_DEBUG) {
334         fort_print("kappa matrix in dft_lin_resp");
335         outmat_(lr_data.kappa,&ONEI,&inforb_.nbast,&ONEI,&inforb_.nbast,
336                 &inforb_.nbast, &inforb_.nbast);
337     }
338 
339     cbdata[0].callback = (DftCallback)lin_resp_cb;
340     cbdata[0].cb_data  = &lr_data;
341     electrons = dft_integrate_ao(&dens, work, lwork, iprint, 0, 0, 0,
342 				 cbdata, ELEMENTS(cbdata));
343 
344     if(DFTLR_DEBUG) {
345         fort_print("AO Fock matrix contribution in dft_lin_resp");
346         outmat_(lr_data.res,&ONEI,&inforb_.nbast,&ONEI,&inforb_.nbast,
347                 &inforb_.nbast, &inforb_.nbast);
348     }
349 
350     for(i=0; i<inforb_.nbast; i++) {
351 	integer ioff = i*inforb_.nbast;
352 	for(j=0; j<i; j++) {
353 	    integer joff = j*inforb_.nbast;
354 	    real averag = lr_data.res[i+joff] + lr_data.res[j+ioff];
355 	    lr_data.res[i+joff] = lr_data.res[j+ioff] = averag;
356 	}
357     }
358 
359     if(DFTLR_DEBUG) {
360         fort_print("MO Fock matrix contribution in dft_lin_resp");
361         outmat_(work,&ONEI,&inforb_.norbt,&ONEI,&inforb_.norbt,
362                 &inforb_.norbt, &inforb_.norbt);
363     }
364 
365     /* transform to MO Fock matrix contribution  */
366     FSYM(lrao2mo)(cmo, ksymop, lr_data.res, fmat, work, lwork);
367 
368     /* FIXME: parallelization here! */
369     free(lr_data.dmat);
370     free(lr_data.kappa);
371     free(lr_data.res);
372     free(lr_data.dtgao);
373     if (*iprint>0) {
374       times(&endtm);
375       utm = endtm.tms_utime-starttm.tms_utime;
376       fort_print("      Electrons: %f(%9.1g): LR-DFT eval. time: %9.1f s",
377                  electrons, (electrons-2.0*inforb_.nrhft)/(2.0*inforb_.nrhft),
378                  utm/(double)sysconf(_SC_CLK_TCK));
379     }
380 }
381 
382 /* ------------------------------------------------------------------- */
383 /* ---------- DFT LONDON ORBITAL TRANSFORMATION EVALUATOR ------------ */
384 /* ------------------------------------------------------------------- */
385 void
386 FSYM(dftmag)(real* excmat,const real* x,const real* y,const real* z,
387 	     const real* gao, const real* gao1,const real* gab1,
388 	     const real* gab2,const real* vxc, const real* vxb,
389 	     const real* rh, const integer* dogga,const integer* fromvx);
390 
391 static void
london_cb(DftGrid * grid,real * d)392 london_cb(DftGrid* grid, real* d)
393 {
394     FirstDrv drvs;
395     integer lnd_off = grid->london_off*inforb_.nbast;
396 
397     dftpot0_(&drvs, &grid->curr_weight, &grid->dp);
398     if(grid->dogga) drvs.fZ *= 1.0/grid->dp.grada;
399     /*
400     fort_print("DFTMAG: (%9.4f,%9.4f,%9.4f): RHO=%g %g %g",
401                grid->corx[grid->curr_point],
402                grid->cory[grid->curr_point],
403                grid->corz[grid->curr_point],
404                grid->rho,
405                drvs.df1000, drvs.df0010);
406     */
407     dftmag_(d,&grid->coor[grid->curr_point][0],
408 	    &grid->coor[grid->curr_point][1],
409 	    &grid->coor[grid->curr_point][2],
410 	    grid->atv, &grid->atv[inforb_.nbast],
411 	    &grid->atv[lnd_off],&grid->atv[lnd_off+inforb_.nbast],
412 	    &drvs.fR, &drvs.fZ, grid->grada, &grid->dogga,
413 	    &ZEROI);
414 }
415 
416 void
417 FSYM(stopit)(const char* sub,const char* place,
418 	     const integer* int1, const integer* int2,
419 	     integer sub_len, integer place_len);
420 
421 void
FSYM2(dft_london)422 FSYM2(dft_london)(real* fx, real* fy, real* fz, real* work,
423 		  integer* lwork, integer* iprint)
424 {
425     DftCallbackData cbdata[1];
426     DftDensity dens = { dft_dens_restricted, NULL, NULL };
427     struct tms starttm, endtm; clock_t utm;
428     real electrons;
429     integer i, j;
430     real* new_work, *xcmat, *dmat;
431     integer new_lwork, xc_sz=3*inforb_.n2basx;
432 
433     xcmat    = work;
434     dmat     = work + 3*inforb_.n2basx;
435     new_work = work + 4*inforb_.n2basx;
436     new_lwork = *lwork-4*inforb_.n2basx;
437     if (new_lwork<0) stopit_("DFTLND"," ", lwork, lwork, 6, 1);
438     cbdata[0].callback = (DftCallback)london_cb;
439     cbdata[0].cb_data  = xcmat;
440     dens.dmata = dmat;
441 
442     times(&starttm);
443     FSYM(dftdns)(dmat, work, &new_lwork, iprint);
444     dzero_(xcmat, &xc_sz);
445     electrons = dft_integrate_ao(&dens, new_work, &new_lwork, iprint, 0, 0, 1,
446 				 cbdata, ELEMENTS(cbdata));
447     /* dft_mol_grad_collect_info(work);                  NO-OP in serial */
448     if (*iprint>0) {
449       times(&endtm);
450       utm = endtm.tms_utime-starttm.tms_utime;
451       fort_print("      Electrons: %f (%9.1g): LONDON evaluation time: %10.2f s",
452                  electrons, (double)(electrons-(integer)(electrons+0.5)),
453 	         (double)(utm/(double)sysconf(_SC_CLK_TCK)));
454     }
455 
456     /* post-transformation: dftdrv stage */
457     /*output_(xcmat,&ONEI,&inforb_.nbast,&ONEI,&inforb_.nbast,
458       &inforb_.nbast, &inforb_.nbast, &ONEI, &priunt_.lupri); */
459 
460     for(i=0; i<inforb_.nbast; i++)
461 	for(j=0; j<i; j++) {
462             real averag = 0.5*(xcmat[i+j*inforb_.nbast]-
463 			       xcmat[j+i*inforb_.nbast]);
464             xcmat[i+j*inforb_.nbast] =  averag;
465             xcmat[j+i*inforb_.nbast] = -averag;
466 	}
467 
468     if(DFTMAG_DEBUG) {
469         fort_print("DFT LDN contribution (x direction)");
470         outmat_(xcmat,&ONEI,&inforb_.nbast,&ONEI,&inforb_.nbast,
471                 &inforb_.nbast, &inforb_.nbast);
472         fort_print("DFT LDN contribution (y direction)");
473         outmat_(xcmat+inforb_.n2basx,&ONEI,&inforb_.nbast,&ONEI,&inforb_.nbast,
474                 &inforb_.nbast, &inforb_.nbast);
475         fort_print("DFT LDN contribution (z direction)");
476         outmat_(xcmat+2*inforb_.n2basx,&ONEI,&inforb_.nbast,&ONEI,
477                 &inforb_.nbast, &inforb_.nbast, &inforb_.nbast);
478     }
479 
480     /* post-transformation: dftlnd stage */
481     daxpy_(&inforb_.n2basx, &ONER,xcmat,                 &ONEI,fx,&ONEI);
482     daxpy_(&inforb_.n2basx, &ONER,xcmat+inforb_.n2basx,  &ONEI,fy,&ONEI);
483     daxpy_(&inforb_.n2basx, &ONER,xcmat+inforb_.n2basx*2,&ONEI,fz,&ONEI);
484     if(DFTMAG_DEBUG) {
485         fort_print("FX (x direction)");
486         outmat_(fx,&ONEI,&inforb_.nbast,&ONEI,&inforb_.nbast,
487                 &inforb_.nbast, &inforb_.nbast);
488         fort_print("FY (y direction)");
489         outmat_(fy,&ONEI,&inforb_.nbast,&ONEI,&inforb_.nbast,
490                 &inforb_.nbast, &inforb_.nbast);
491         fort_print("FZ (z direction)");
492         outmat_(fz,&ONEI,&inforb_.nbast,&ONEI,&inforb_.nbast,
493                 &inforb_.nbast, &inforb_.nbast);
494     }
495 
496 }
497 
498 
499 /* ================================================================== */
500 /* Open shell versions of the dft property evaluators                 */
501 /* ================================================================== */
502 /* ZR alpha/beta version */
503 typedef struct {
504     real *ksma, *ksmb;
505     real energy;
506 } DftKohnShamU;
507 
508 #if defined(VAR_MPI)
509 /* dft_kohn_sham_ab_slave:
510    this is a slave driver. It's task is to allocate memory needed by
511    the main property evaluator (dft_kohn_shamab in this case) and call it.
512 */
513 void
dft_kohn_shamab_slave(real * work,integer * lwork,integer * iprint)514 dft_kohn_shamab_slave(real* work, integer* lwork, integer* iprint)
515 {
516     real* dmat = malloc(2*inforb_.n2basx*sizeof(real));
517     real* ksm  = calloc(2*inforb_.n2basx,sizeof(real));
518     real edfty;
519     FSYM2(dft_kohn_shamab)(dmat, ksm, &edfty, work, lwork, iprint);
520     free(dmat);
521     free(ksm);
522 }
523 
524 static __inline__ void
dft_kohn_shamab_sync_slaves(real * dmat)525 dft_kohn_shamab_sync_slaves(real* dmat)
526 {
527     MPI_Bcast(dmat, 2*inforb_.n2basx,MPI_DOUBLE,
528 	      MASTER_NO, MPI_COMM_WORLD);
529 }
530 
531 static __inline__ void
dft_kohn_shamab_collect_info(real * ksm,real * energy,real * work,integer lwork)532 dft_kohn_shamab_collect_info(real*ksm, real* energy, real* work, integer lwork)
533 {
534     real tmp = *energy;
535     int sz = 0;
536     MPI_Comm_size(MPI_COMM_WORLD, &sz);
537     if(sz<=1) return;
538 
539     sz = 2*inforb_.n2basx;
540     integer sz_ftn = sz;
541     CHECK_WRKMEM(sz,lwork);
542     dcopy_(&sz_ftn, ksm, &ONEI, work, &ONEI);
543     MPI_Reduce(work, ksm, sz, MPI_DOUBLE, MPI_SUM,
544 	       MASTER_NO, MPI_COMM_WORLD);
545     MPI_Reduce(&tmp, energy,  1, MPI_DOUBLE, MPI_SUM,
546 	       MASTER_NO, MPI_COMM_WORLD);
547 }
548 
549 #else /* VAR_MPI */
550 #define dft_kohn_shamab_sync_slaves(dmat)
551 #define dft_kohn_shamab_collect_info(myksm, ksm, energy,lwork)
552 #endif /* VAR_MPI */
553 
554 static void
kohn_shamab_cb(DftGrid * grid,DftKohnShamU * exc)555 kohn_shamab_cb(DftGrid* grid, DftKohnShamU* exc)
556 {
557     FunFirstFuncDrv drvs;
558     integer isym, i, j;
559 
560     drv1_clear(&drvs);
561     exc->energy += selected_func->func(&grid->dp)*grid->curr_weight;
562     if(grid->dp.rhob<1e-20) grid->dp.rhob = 1e-20;
563     if(grid->dp.gradb<1e-20) grid->dp.gradb = 1e-20;
564     if(grid->dp.gradab<1e-20) /* so small that we can ignore direction */
565       grid->dp.gradab = 1e-20;
566     selected_func->first(&drvs, grid->curr_weight, &grid->dp);
567     if(isnan(drvs.df1000) || isnan(drvs.df0100) || isinf(drvs.df0100) ||
568        isnan(drvs.df0010) || isnan(drvs.df0001) ||
569        isnan(drvs.df00001)) {
570       fort_print("AAA: %g %g %g %g %g => %g %g %g %g %g",
571                  grid->dp.rhoa, grid->dp.grada,
572                  grid->dp.rhob, grid->dp.gradb, grid->dp.gradab,
573                  drvs.df1000, drvs.df0100, drvs.df0010, drvs.df0001,
574                  drvs.df00001);
575     }
576     if(grid->dogga) {
577 	real *atvX = &grid->atv[inforb_.nbast];
578 	real *atvY = &grid->atv[inforb_.nbast*2];
579 	real *atvZ = &grid->atv[inforb_.nbast*3];
580 	real fxa, fya, fza, fxb, fyb, fzb;
581         /* add epsilon to avoid division by zero */
582         real grada = fabs(grid->dp.grada)>1e-40 ? grid->dp.grada : 1e-40;
583         real gradb = fabs(grid->dp.gradb)>1e-40 ? grid->dp.gradb : 1e-40;
584         /* alpha  */
585 	drvs.df0010 *= 2.0/grada;
586 	fxa = drvs.df0010*grid->grada[0]+2.0*drvs.df00001*grid->gradb[0];
587 	fya = drvs.df0010*grid->grada[1]+2.0*drvs.df00001*grid->gradb[1];
588 	fza = drvs.df0010*grid->grada[2]+2.0*drvs.df00001*grid->gradb[2];
589         /*beta */
590 	drvs.df0001 *= 2.0/gradb;
591 	fxb = drvs.df0001*grid->gradb[0]+2.0*drvs.df00001*grid->grada[0];
592 	fyb = drvs.df0001*grid->gradb[1]+2.0*drvs.df00001*grid->grada[1];
593 	fzb = drvs.df0001*grid->gradb[2]+2.0*drvs.df00001*grid->grada[2];
594 
595 	for(isym=0; isym<inforb_.nsym; isym++) {
596 	    integer istr = inforb_.ibas[isym];
597 	    integer iend = inforb_.ibas[isym]+inforb_.nbas[isym];
598 	    for(j=istr; j<iend; j++) {
599 		real fca = drvs.df1000*grid->atv[j]
600 		    +fxa*atvX[j] + fya*atvY[j] + fza*atvZ[j];
601                 real fcb = drvs.df0100*grid->atv[j]
602 		    +fxb*atvX[j] + fyb*atvY[j] + fzb*atvZ[j];
603                 integer joff = j*inforb_.nbast;
604                 for(i=istr; i<iend; i++)
605                     exc->ksma[i+joff] += fca*grid->atv[i];
606                 for(i=istr; i<iend; i++)
607                     exc->ksmb[i+joff] += fcb*grid->atv[i];
608             }
609 	}
610     } else {
611         for(isym=0; isym<inforb_.nsym; isym++) {
612             integer istr = inforb_.ibas[isym];
613             integer iend = inforb_.ibas[isym]+inforb_.nbas[isym];
614             for(j=istr; j<iend; j++) {
615                 real gvxca = 2.0*drvs.df1000*grid->atv[j];
616                 real gvxcb = 2.0*drvs.df0100*grid->atv[j];
617                 integer joff = j*inforb_.nbast;
618                 for(i=istr; i<j; i++)
619                     exc->ksma[i+joff] += gvxca*grid->atv[i];
620                 exc->ksma[j+joff] += 0.5*gvxca*grid->atv[j];
621                 for(i=istr; i<j; i++)
622                     exc->ksmb[i+joff] += gvxcb*grid->atv[i];
623                 exc->ksmb[j+joff] += 0.5*gvxcb*grid->atv[j];
624             }
625         }
626     }
627 }
628 
629 
630 /* ZR alpha/beta version:
631  * we require that dmata and dmatb are aligned consecutively in memory
632  * in order to reduce the MPI latency penalty.
633  */
634 void
FSYM2(dft_kohn_shamab)635 FSYM2(dft_kohn_shamab)(real* dmat, real* ksm, real *edfty,
636 		       real* work, integer *lwork, integer* iprint)
637 {
638     integer nbast2, i, j;
639     DftCallbackData cbdata[1];
640     DftKohnShamU res; /* res like result */
641     DftDensity dens = { dft_dens_unrestricted, NULL, NULL };
642     struct tms starttm, endtm; clock_t utm;
643     real electrons, exp_el = 2.0*inforb_.nrhft+inforb_.nasht;
644     integer sz;
645 
646     /* WARNING: NO work MAY BE done before syncing slaves! */
647     dft_wake_slaves((DFTPropEvalMaster)FSYM2(dft_kohn_shamab));/* NO-OP in serial */
648     dft_kohn_shamab_sync_slaves(dmat);                   /* NO-OP in serial */
649 
650     dens.dmata = dmat; dens.dmatb = dmat + inforb_.n2basx;
651     nbast2   = inforb_.nbast*inforb_.nbast;
652     res.ksma = calloc(2*nbast2, sizeof(real));
653     res.ksmb = res.ksma + nbast2;
654     cbdata[0].callback = (DftCallback)kohn_shamab_cb;
655     cbdata[0].cb_data  = &res;
656 
657     times(&starttm);
658     res.energy = 0.0;
659     electrons = dft_integrate_ao(&dens, work, lwork, iprint, 0, 0, 0,
660                                  cbdata, ELEMENTS(cbdata));
661     dft_kohn_shamab_collect_info(res.ksma, &res.energy, work, *lwork);
662 
663     for(i=0; i<inforb_.nbast; i++) {
664 	integer ioff = i*inforb_.nbast;
665 	for(j=0; j<i; j++) {
666 	    integer joff = j*inforb_.nbast;
667 	    real averag = 0.5*(res.ksma[i+joff] + res.ksma[j+ioff]);
668 	    res.ksma[i+joff] = res.ksma[j+ioff] = averag;
669             averag = 0.5*(res.ksmb[i+joff] + res.ksmb[j+ioff]);
670             res.ksmb[i+joff] = res.ksmb[j+ioff] = averag;
671 	}
672     }
673     *edfty=res.energy;
674     sz = 2*inforb_.n2basx;
675     daxpy_(&sz, &ONER, res.ksma, &ONEI, ksm, &ONEI);
676 
677     free(res.ksma);
678     if (*iprint>0) {
679       times(&endtm);
680       utm = endtm.tms_utime-starttm.tms_utime;
681       fort_print("      Electrons: %11.7f %9.1g: Energy %f KS time: %9.1f s",
682                  electrons, (electrons-exp_el)/exp_el,
683                  res.energy, utm/(double)sysconf(_SC_CLK_TCK));
684     }
685 }
686 
687 /* =================================================================== */
688 typedef struct {
689     real* dmata, *dmatb, *kappaa, *kappab, *resa, *resb;
690     real* dtgaoa, *dtgaob;
691     integer trplet, ksymop;
692 } LinRespDataab;
693 
694 
695 #if defined(VAR_MPI)
696 void
dft_lin_respab_slave(real * work,integer * lwork,integer * iprint)697 dft_lin_respab_slave(real* work, integer* lwork, integer* iprint)
698 {
699     real *fmat = calloc(2*inforb_.n2orbx,sizeof(real));            /* OUT */
700     real *cmo  = malloc(inforb_.norbt*inforb_.nbast*sizeof(real)); /* IN  */
701     real *zymat= malloc(2*inforb_.n2orbx*sizeof(real));            /* IN  */
702     integer trplet;                         /* IN: will be synced from master */
703     integer ksymop;                         /* IN: will be synced from master */
704     FSYM2(dft_lin_respab)(fmat, fmat+inforb_.n2orbx, cmo, zymat, &trplet,
705 			  &ksymop, work, lwork, iprint);
706     free(fmat);
707     free(cmo);
708     free(zymat);
709 }
710 
711 static __inline__ void
dft_lin_respab_collect_info(real * fmat,real * work,integer lwork)712 dft_lin_respab_collect_info(real* fmat, real*work, integer lwork)
713 {
714     int sz = 0;
715     MPI_Comm_size(MPI_COMM_WORLD, &sz);
716     if(sz<=1) return;
717 
718     sz = 2*inforb_.n2orbx;
719     integer sz_ftn =sz;
720     CHECK_WRKMEM(sz,lwork);
721     dcopy_(&sz_ftn, fmat, &ONEI, work, &ONEI);
722     MPI_Reduce(work, fmat, sz, MPI_DOUBLE, MPI_SUM,
723 	       MASTER_NO, MPI_COMM_WORLD);
724 }
725 
726 #else  /* VAR_MPI */
727 #define dft_lin_respab_collect_info(fmat,work, lwork)
728 #endif /* VAR_MPI */
729 
730 
731 static void
compute_trans_rho(DftGrid * grid,LinRespDataab * data,real * rhowa,real * rhowb)732 compute_trans_rho(DftGrid* grid, LinRespDataab* data, real* rhowa, real* rhowb)
733 {
734     dgemv_("N",&inforb_.nbast,&inforb_.nbast,&ONER,
735 	   data->kappaa,&inforb_.nbast,grid->atv,&ONEI,&ZEROR,
736 	   data->dtgaoa,&ONEI);
737     *rhowa = ddot_(&inforb_.nbast,data->dtgaoa,&ONEI,grid->atv,&ONEI);
738     dgemv_("N",&inforb_.nbast,&inforb_.nbast,&ONER,
739 	   data->kappab,&inforb_.nbast,grid->atv,&ONEI,&ZEROR,
740 	   data->dtgaob,&ONEI);
741     *rhowb = ddot_(&inforb_.nbast,data->dtgaob,&ONEI,grid->atv,&ONEI);
742 }
743 
744 
745 /* lin_resp_cbab:
746  * ZR alpa/beta version of the linear response evaluation routine.
747  * triplet: hypotetical case of triplet excitations
748  *          should work for general case....       */
749 static void
lin_resp_cbab_gga(DftGrid * grid,LinRespDataab * data)750 lin_resp_cbab_gga(DftGrid* grid, LinRespDataab* data)
751 {
752     /*    real b0a, b0b, b3a[3],b3b[3],gradab; */
753     real rhowa,rhowb, gradwa[3], gradwb[3];
754     integer isym, i, j;
755     integer jsym, istr, iend, jstr, jend, ioff;
756     FunSecondFuncDrv vxc;
757     real zetaa, zetab, zetac;
758     real znva, rxa, rya, rza;
759     real znvb, rxb, ryb, rzb;
760     real fac0, facr, facz;
761     real ar, ab, arb, abw;
762     real g0, gx, gy, gz;
763     real a0, ax, ay, az;
764     real *atvX = &grid->atv[inforb_.nbast];
765     real *atvY = &grid->atv[inforb_.nbast*2];
766     real *atvZ = &grid->atv[inforb_.nbast*3];
767 
768     compute_trans_rho(grid, data, &rhowa, &rhowb);
769      if (data->trplet) rhowb = -rhowb;
770     /* gradwa evaluation */
771     dgemv_("T",&inforb_.nbast,&THREEI,&ONER,atvX, &inforb_.nbast,
772            data->dtgaoa,&ONEI, &ZEROR, gradwa, &ONEI);
773     dgemv_("T",&inforb_.nbast, &inforb_.nbast, &ONER, data->kappaa,
774            &inforb_.nbast,grid->atv,&ONEI,&ZEROR,data->dtgaoa,&ONEI);
775     dgemv_("T",&inforb_.nbast,&THREEI,&ONER,atvX,
776            &inforb_.nbast,data->dtgaoa,&ONEI,&ONER,gradwa,&ONEI);
777     /* gradwb evaluation */
778     dgemv_("T",&inforb_.nbast,&THREEI,&ONER,atvX, &inforb_.nbast,
779            data->dtgaob,&ONEI, &ZEROR, gradwb, &ONEI);
780     dgemv_("T",&inforb_.nbast, &inforb_.nbast, &ONER, data->kappab,
781            &inforb_.nbast,grid->atv,&ONEI,&ZEROR,data->dtgaob,&ONEI);
782     dgemv_("T",&inforb_.nbast,&THREEI,&ONER,atvX,
783            &inforb_.nbast,data->dtgaob,&ONEI,&ONER,gradwb,&ONEI);
784     if  (data->trplet) {
785         gradwb[0]= -gradwb[0];
786         gradwb[1]= -gradwb[1];
787         gradwb[2]= -gradwb[2];
788     }
789     /* second derivatives calculation */
790     drv2_clear(&vxc);
791     selected_func->second(&vxc, grid->curr_weight, &grid->dp);
792     /* alpha coeficients */
793     /* add epsilon to avoid division by zero */
794     znva = 1.0/(fabs(grid->dp.grada)>1e-40 ? grid->dp.grada : 1e-40);
795     vxc.df0010 = znva*vxc.df0010;
796     rxa = znva*grid->grada[0];
797     rya = znva*grid->grada[1];
798     rza = znva*grid->grada[2];
799     /* beta coeficients */
800     znvb = 1.0/(fabs(grid->dp.gradb)>1e-40 ? grid->dp.gradb : 1e-40);
801     vxc.df0001 = znvb*vxc.df0001;
802     rxb = znvb*grid->gradb[0];
803     ryb = znvb*grid->gradb[1];
804     rzb = znvb*grid->gradb[2];
805     /* variations of functionals variables */
806     zetaa = gradwa[0]*rxa + gradwa[1]*rya + gradwa[2]*rza;
807     zetab = gradwb[0]*rxb + gradwb[1]*ryb + gradwb[2]*rzb;
808     zetac = gradwa[0]*grid->gradb[0]+gradwa[1]*grid->gradb[1] +
809         gradwa[2]*grid->gradb[2]+gradwb[0]*grid->grada[0] +
810         gradwb[1]*grid->grada[1]+gradwb[2]*grid->grada[2];
811 
812     /* Derivatives can go in principle to infinity if evaluated at
813      * inapriopriate points. Take precautions. */
814     fac0 = vxc.df1010*zetaa+vxc.df1001*zetab +vxc.df10001*zetac;
815     if(fabs(rhowa)>0) fac0 += vxc.df2000*rhowa;
816     if(fabs(rhowa)>0) fac0 += vxc.df1100*rhowb;
817     facr = vxc.df1010*rhowa+vxc.df0110*rhowb+vxc.df0020*zetaa
818         +vxc.df0011*zetab
819         +vxc.df00101*zetac;
820     /* note: the mixed zetac derivatives not included in facr */
821     facz = vxc.df10001*rhowa+vxc.df01001*rhowb
822          + vxc.df00101*zetaa+vxc.df00011*zetab+vxc.df00002*zetac;
823     /* note: the mixed zetac derivatives not included in facz */
824     for(isym=0; isym<inforb_.nsym; isym++) {
825         istr = inforb_.ibas[isym];
826         iend = inforb_.ibas[isym] + inforb_.nbas[isym];
827         jsym = inforb_.muld2h[data->ksymop-1][isym]-1;
828         if (isym>=jsym) {
829             jstr = inforb_.ibas[jsym];
830             jend = inforb_.ibas[jsym] + inforb_.nbas[jsym];
831             for(i=istr; i<iend; i++) {
832                 g0 = grid->atv[i];
833                 gx = atvX[i];
834                 gy = atvY[i];
835                 gz = atvZ[i];
836                 ioff = i*inforb_.nbast;
837                 for(j=min(i,jend); j>=jstr; j--) {
838                     a0 = g0*grid->atv[j];
839                     ax = gx*grid->atv[j] + g0*atvX[j];
840                     ay = gy*grid->atv[j] + g0*atvY[j];
841                     az = gz*grid->atv[j] + g0*atvZ[j];
842                     ar = ax*rxa + ay*rya + az*rza;
843                     ab = ax*gradwa[0]+ay*gradwa[1]+az*gradwa[2]-ar*zetaa;
844                     arb = ax*grid->gradb[0]+ay*grid->gradb[1]+az*grid->gradb[2];
845                     abw = ax*gradwb[0]+ay*gradwb[1]+az*gradwb[2];
846                     /* triplet:
847                        abw = -(ax*gradwb[0]+ay*gradwb[1]+az*gradwb[2]);
848                     */
849                     data->resa[j+ioff] += 0.5*(fac0*a0+facr*ar+arb*facz+
850                                                vxc.df0010*ab+abw*vxc.df00001);
851                 }
852 	    }
853 	}
854     }
855     /* beta Fock contribution evaluation */
856     fac0 = vxc.df0101*zetab+vxc.df0110*zetaa +vxc.df01001*zetac;
857     if(fabs(rhowb)>0) fac0 += vxc.df0200*rhowb;
858     if(fabs(rhowa)>0) fac0 += vxc.df1100*rhowa;
859     facr = vxc.df0101*rhowb+vxc.df1001*rhowa
860         +vxc.df0002*zetab+vxc.df0011*zetaa+vxc.df00011*zetac;
861     /* note: the mixed zetac derivatives not included in facr */
862     facz = vxc.df10001*rhowa+vxc.df01001*rhowb
863          + vxc.df00011*zetab+vxc.df00101*zetaa+vxc.df00002*zetac;
864     /* note: the mixed zetac derivatives not included in facr */
865     for(isym=0; isym<inforb_.nsym; isym++) {
866         istr = inforb_.ibas[isym];
867         iend = inforb_.ibas[isym] + inforb_.nbas[isym];
868         jsym = inforb_.muld2h[data->ksymop-1][isym]-1;
869         if (isym>=jsym) {
870             jstr = inforb_.ibas[jsym];
871             jend = inforb_.ibas[jsym] + inforb_.nbas[jsym];
872             for(i=istr; i<iend; i++) {
873                 g0 = grid->atv[i];
874                 gx = atvX[i];
875                 gy = atvY[i];
876                 gz = atvZ[i];
877                 ioff = i*inforb_.nbast;
878                 for(j=min(i,jend); j>=jstr; j--) {
879                     a0 = g0*grid->atv[j];
880                     ax = gx*grid->atv[j] + g0*atvX[j];
881                     ay = gy*grid->atv[j] + g0*atvY[j];
882                     az = gz*grid->atv[j] + g0*atvZ[j];
883                     ar = ax*rxb + ay*ryb + az*rzb;
884                     ab = ax*gradwb[0]+ay*gradwb[1]+az*gradwb[2]
885                         -ar*zetab;
886                     arb = ax*grid->grada[0]+ay*grid->grada[1]
887                         +az*grid->grada[2];
888                     abw = ax*gradwa[0]+ay*gradwa[1]+az*gradwa[2];
889                     data->resb[j+ioff] +=
890                         0.5*(fac0*a0+facr*ar+arb*facz+
891                              vxc.df0001*ab+abw*vxc.df00001);
892 		}
893 	    }
894 	}
895     }
896 }
897 
898 /* lin_resp_cbab:
899  * ZR alpa/beta version of the linear response evaluation routine.
900  * triplet: hypotetical case of triplet excitations
901  *          should work for general case....       */
902 static void
lin_resp_cbab_nogga(DftGrid * grid,LinRespDataab * data)903 lin_resp_cbab_nogga(DftGrid* grid, LinRespDataab* data)
904 {
905     /*    real b0a, b0b, b3a[3],b3b[3],gradab; */
906     real rhowa,rhowb;
907     integer isym, i, j;
908     integer jsym, istr, iend, jstr, jend, ioff;
909     FunSecondFuncDrv vxc;
910     real vt, gvi;
911 
912     compute_trans_rho(grid, data, &rhowa, &rhowb);
913     if (data->trplet) rhowb = -rhowb;
914     drv2_clear(&vxc);
915     selected_func->second(&vxc, grid->curr_weight, &grid->dp);
916     /* alpha Fock contribution */
917     /* derivatives can in principle diverge, be cautious! */
918     vt = 0;
919     if(fabs(rhowa)>0) vt += 0.5*vxc.df2000*rhowa;
920     if(fabs(rhowb)>0) vt += 0.5*vxc.df1100*rhowb;
921     if(fabs(vt)>1e-15) {
922         for(isym=0; isym<inforb_.nsym; isym++) {
923             jsym = inforb_.muld2h[data->ksymop-1][isym]-1;
924             if(isym>=jsym) {
925                 istr = inforb_.ibas[isym];
926                 iend = inforb_.ibas[isym] + inforb_.nbas[isym];
927                 jstr = inforb_.ibas[jsym];
928                 jend = inforb_.ibas[jsym] + inforb_.nbas[jsym];
929                 for(i=istr; i<iend; i++) {
930                     ioff = i*inforb_.nbast;
931                     gvi = vt*grid->atv[i];
932                     for(j=min(i,jend-1); j>=jstr; j--)
933                         data->resa[j+ioff] += gvi*grid->atv[j];
934                 }
935             }
936         }
937     }
938     /* beta Fock contribution */
939     /* derivatives can in principle diverge, be cautious! */
940     vt = 0;
941     if(fabs(rhowb)>0) vt += 0.5*vxc.df0200*rhowb;
942     if(fabs(rhowa)>0) vt += 0.5*vxc.df1100*rhowa;
943     if(fabs(vt)>1e-15) {
944         for(isym=0; isym<inforb_.nsym; isym++) {
945             jsym = inforb_.muld2h[data->ksymop-1][isym]-1;
946             if(isym>=jsym) {
947                 istr = inforb_.ibas[isym];
948                 iend = inforb_.ibas[isym] + inforb_.nbas[isym];
949                 jstr = inforb_.ibas[jsym];
950                 jend = inforb_.ibas[jsym] + inforb_.nbas[jsym];
951                 for(i=istr; i<iend; i++) {
952                     ioff = i*inforb_.nbast;
953                     gvi = vt*grid->atv[i];
954                     for(j=min(i,jend-1); j>=jstr; j--)
955                         data->resb[j+ioff] += gvi*grid->atv[j];
956                 }
957             }
958         }
959     }
960 }
961 
962 /* ZR linear response for open shell system:
963    Evaluates DFT contributions:
964    - core fock  matrix      i.e. FCONE
965    - open shell fock matix  i.e  FVONE
966  */
967 
968 void
FSYM2(dft_lin_respab)969 FSYM2(dft_lin_respab)(real* fmatc, real* fmato,  real *cmo, real *zymat,
970 		      integer *trplet, integer *ksymop,
971 		      real* work, integer* lwork, integer* iprint)
972 {
973     static const real DP5R = 0.5;
974     static const real MONER = -1.0;
975     real electrons = 13.0;
976     struct tms starttm, endtm; clock_t utm;
977     LinRespDataab lr_data;
978     DftCallbackData cbdata[1];
979     integer i=1, j, ioff, joff, norbi, norbj;
980     integer nvec1=1;
981     real averag;
982     real *fmata, *fmatb;
983     real *runit;
984     DftDensity dens = { dft_dens_unrestricted, NULL, NULL };
985     integer isym, jsym;
986 
987     /* WARNING: NO work MAY BE done before syncing slaves! */
988     dft_wake_slaves((DFTPropEvalMaster)FSYM2(dft_lin_respab)); /* NO-OP in serial */
989     dft_lin_resp_sync_slaves(cmo,&nvec1,&zymat,trplet, ksymop,
990                              &work, *lwork);              /* NO-OP in serial */
991 
992     times(&starttm);
993     /* linear reponse data */
994     dens.dmata = lr_data.dmata = malloc(inforb_.n2basx*sizeof(real));
995     dens.dmatb = lr_data.dmatb = malloc(inforb_.n2basx*sizeof(real));
996     lr_data.resa    = calloc(2*inforb_.n2basx,sizeof(real));
997     lr_data.resb    = lr_data.resa + inforb_.n2basx; /* it's an alias only */
998     lr_data.kappaa  = calloc(inforb_.n2basx,sizeof(real));
999     lr_data.kappab  = calloc(inforb_.n2basx,sizeof(real));
1000     lr_data.dtgaoa  = malloc(inforb_.nbast *sizeof(real));
1001     lr_data.dtgaob  = malloc(inforb_.nbast *sizeof(real));
1002     lr_data.trplet  = *trplet;
1003     lr_data.ksymop  = *ksymop;
1004 
1005     /* get alpha/beta densities and corresponding kappas */
1006 
1007     FSYM2(dft_get_ao_dens_matab)(cmo,lr_data.dmatb,lr_data.dmata,work,lwork);
1008     daxpy_(&inforb_.n2basx,&DP5R,lr_data.dmatb,&ONEI,lr_data.dmata,&ONEI);
1009     dscal_(&inforb_.n2basx,&DP5R,lr_data.dmatb,&ONEI);
1010     runit=calloc(inforb_.n2ashx,sizeof(real));
1011     dunit_(runit,&inforb_.nasht);
1012     FSYM(deq27)(cmo,zymat,runit,lr_data.kappab,lr_data.kappaa,work,lwork);
1013     dscal_(&inforb_.n2basx,&DP5R,lr_data.kappab,&ONEI);
1014     free(runit);
1015     daxpy_(&inforb_.n2basx,&ONER,lr_data.kappab,&ONEI,lr_data.kappaa,&ONEI);
1016 
1017     cbdata[0].callback =
1018         (DftCallback)(selected_func->is_gga() ?
1019                       lin_resp_cbab_gga : lin_resp_cbab_nogga);
1020     cbdata[0].cb_data  = &lr_data;
1021     electrons = dft_integrate_ao(&dens,work, lwork, iprint, 0, 0, 0,
1022                                  cbdata,ELEMENTS(cbdata));
1023 
1024     dft_lin_respab_collect_info(lr_data.resa, work, *lwork);/*serial:NO-OP*/
1025 
1026     for(i=0; i<inforb_.nbast; i++) {
1027 	ioff = i*inforb_.nbast;
1028 	for(j=0; j<i; j++) {
1029 	    joff = j*inforb_.nbast;
1030 	    averag = lr_data.resa[i+joff] + lr_data.resa[j+ioff];
1031 	    lr_data.resa[i+joff] = lr_data.resa[j+ioff] = averag;
1032             averag = lr_data.resb[i+joff] + lr_data.resb[j+ioff];
1033 	    lr_data.resb[i+joff] = lr_data.resb[j+ioff] = averag;
1034 	}
1035     }
1036     /*    fort_print("Ressa: ");
1037     output_(lr_data.resa,&ONEI,&inforb_.norbt,&ONEI,&inforb_.norbt,
1038     &inforb_.norbt, &inforb_.norbt, &ONEI, &priunt_.lupri);
1039     fort_print("Ressb");
1040     output_(lr_data.resb,&ONEI,&inforb_.norbt,&ONEI,&inforb_.norbt,
1041     &inforb_.norbt, &inforb_.norbt, &ONEI, &priunt_.lupri); */
1042 
1043     fmata=calloc(inforb_.n2orbx,sizeof(real));
1044     fmatb=calloc(inforb_.n2orbx,sizeof(real));
1045     for(isym=1; isym<=inforb_.nsym; isym++) {
1046 	jsym  = inforb_.muld2h[lr_data.ksymop-1][isym-1];
1047         norbi = inforb_.norb[isym-1];
1048 	norbj = inforb_.norb[jsym-1];
1049 	if(norbi>0 && norbj>0)
1050 	    FSYM(autpv)(&isym,&jsym,&cmo[inforb_.icmo[isym-1]],
1051 		   &cmo[inforb_.icmo[jsym-1]], lr_data.resa,
1052 		   inforb_.nbas,&inforb_.nbast,fmata,
1053 		   inforb_.norb,&inforb_.norbt,
1054 		   work, lwork);
1055     }
1056     for(isym=1; isym<=inforb_.nsym; isym++) {
1057 	jsym  = inforb_.muld2h[lr_data.ksymop-1][isym-1];
1058         norbi = inforb_.norb[isym-1];
1059 	norbj = inforb_.norb[jsym-1];
1060 	if(norbi>0 && norbj>0)
1061 	    FSYM(autpv)(&isym,&jsym,&cmo[inforb_.icmo[isym-1]],
1062 		   &cmo[inforb_.icmo[jsym-1]], lr_data.resb,
1063 		   inforb_.nbas,&inforb_.nbast,fmatb,
1064 		   inforb_.norb,&inforb_.norbt,
1065 		   work, lwork);
1066     }
1067     daxpy_(&inforb_.n2orbx,&TWOR,fmata,&ONEI,fmatc,&ONEI);
1068     if (inforb_.nasht > 0) {
1069         if (*trplet) {
1070              daxpy_(&inforb_.n2orbx,&MONER,fmatb,&ONEI,fmato,&ONEI);
1071         } else {
1072              daxpy_(&inforb_.n2orbx,&ONER,fmatb,&ONEI,fmato,&ONEI);
1073         }
1074        daxpy_(&inforb_.n2orbx,&MONER,fmata,&ONEI,fmato,&ONEI);
1075     }
1076     free(fmata);
1077     free(fmatb);
1078     free(lr_data.dmata);
1079     free(lr_data.dmatb);
1080     free(lr_data.resa);
1081     free(lr_data.kappaa);
1082     free(lr_data.kappab);
1083     free(lr_data.dtgaoa);
1084     free(lr_data.dtgaob);
1085     if (*iprint>0) {
1086       times(&endtm);
1087       utm = endtm.tms_utime-starttm.tms_utime;
1088       fort_print("      Electrons: %f(%9.1g): LR-DFT evaluation time: %9.1f s",
1089                  electrons, (double)(electrons-(integer)(electrons+0.5)),
1090                  utm/(double)sysconf(_SC_CLK_TCK));
1091     }
1092 }
1093 
1094 /* =================================================================== */
1095 /*                    BLOCKED PROPERTY EVALUATORS                      */
1096 /* =================================================================== */
1097 static void
distribute_lda_bl(DftIntegratorBl * grid,integer bllen,integer blstart,integer blend,real * RESTRICT tmp,real * RESTRICT dR,real * RESTRICT excmat)1098 distribute_lda_bl(DftIntegratorBl *grid, integer bllen, integer blstart, integer blend,
1099                   real * RESTRICT tmp, real *RESTRICT dR,
1100                   real * RESTRICT excmat)
1101 {
1102     integer isym, jbl, j, ibl, i, k;
1103     real * RESTRICT aos = grid->atv;
1104 
1105     for(isym=0; isym<grid->nsym; isym++) {
1106         integer (*RESTRICT blocks)[2] = BASBLOCK(grid,isym);
1107         integer bl_cnt = grid->bas_bl_cnt[isym];
1108 
1109         for(jbl=0; jbl<bl_cnt; jbl++)
1110             for(j=blocks[jbl][0]-1; j<blocks[jbl][1]; j++) {
1111                 integer joff = j*bllen;
1112                 for(k=blstart; k<blend; k++)
1113                     tmp[k+joff] = aos[k+joff]*dR[k];
1114             }
1115 
1116         for(jbl=0; jbl<bl_cnt; jbl++) {
1117             for(j=blocks[jbl][0]-1; j<blocks[jbl][1]; j++) {
1118                 real *RESTRICT tmpj = tmp + j*bllen; /* jth orbital */
1119                 real *RESTRICT e_jcol = excmat + j*inforb_.nbast;
1120                 for(ibl=0; ibl<bl_cnt; ibl++) {
1121                     integer top = blocks[ibl][1] < j
1122                         ? blocks[ibl][1] : j;
1123                     for(i=blocks[ibl][0]-1; i<top; i++) {
1124                         real *RESTRICT aosi = aos + i*bllen; /* ith orbital */
1125                         for(k=blstart; k<blend; k++)
1126                             e_jcol[i] += aosi[k]*tmpj[k];
1127                     }
1128                 }
1129                 for(k=blstart; k<blend; k++)
1130                     e_jcol[j] += 0.5*aos[k+j*bllen]*tmpj[k];
1131             }
1132         }
1133     }
1134 }
1135 
1136 static void
distribute_gga_bl(DftIntegratorBl * grid,integer bllen,integer blstart,integer blend,real * RESTRICT tmp,real * RESTRICT dR,real * RESTRICT dZ,real * RESTRICT excmat)1137 distribute_gga_bl(DftIntegratorBl *grid, integer bllen, integer blstart, integer blend,
1138                   real * RESTRICT tmp, real *RESTRICT dR, real *RESTRICT dZ,
1139                   real * RESTRICT excmat)
1140 {
1141     integer isym, jbl, j, ibl, i, k;
1142     real * RESTRICT aox = grid->atv+bllen*inforb_.nbast;
1143     real * RESTRICT aoy = grid->atv+bllen*inforb_.nbast*2;
1144     real * RESTRICT aoz = grid->atv+bllen*inforb_.nbast*3;
1145     real * RESTRICT aos = grid->atv;
1146 
1147     for(isym=0; isym<grid->nsym; isym++) {
1148         integer (*RESTRICT blocks)[2] = BASBLOCK(grid,isym);
1149         integer nblocks = grid->bas_bl_cnt[isym];
1150         for(jbl=0; jbl<nblocks; jbl++)
1151             for(j=blocks[jbl][0]-1; j<blocks[jbl][1]; j++) {
1152                 integer joff = j*bllen;
1153                 for(k=0; k<bllen; k++)
1154                     tmp[k+joff] =
1155                         dR[k]* aos[k+j*bllen] +
1156                         dZ[k]*(aox[k+j*bllen]*grid->g.rad.a[k][0]+
1157                                aoy[k+j*bllen]*grid->g.rad.a[k][1]+
1158                                aoz[k+j*bllen]*grid->g.rad.a[k][2]);
1159         }
1160 
1161         for(jbl=0; jbl<nblocks; jbl++) {
1162             for(j=blocks[jbl][0]-1; j<blocks[jbl][1]; j++) {
1163                 real *RESTRICT tmpj = tmp + j*bllen; /* jth orbital */
1164                 real *RESTRICT e_jcol = excmat + j*inforb_.nbast;
1165                 for(ibl=0; ibl<nblocks; ibl++) {
1166                     for(i=blocks[ibl][0]-1; i<blocks[ibl][1]; i++) {
1167                         for(k=0; k<bllen; k++)
1168                             e_jcol[i] += aos[k+i*bllen]*tmpj[k];
1169                     }
1170                 }
1171             }
1172         }
1173     }
1174 }
1175 
1176 /* =================================================================== */
1177 /*                 blocked density and KS evaluation                   */
1178 /* =================================================================== */
1179 
1180 struct ks_data {
1181   real* excmat;
1182   real* dR;
1183   real* dZ;
1184   real energy;
1185 };
1186 static void
kohn_sham_cb_b_lda(DftIntegratorBl * grid,real * RESTRICT tmp,integer bllen,integer blstart,integer blend,struct ks_data * data)1187 kohn_sham_cb_b_lda(DftIntegratorBl *grid, real * RESTRICT tmp,
1188                    integer bllen, integer blstart, integer blend,
1189                    struct ks_data* data)
1190 {
1191     FirstDrv drvs;
1192     integer i;
1193     real * RESTRICT excmat = data->excmat;
1194     real * RESTRICT dR     = data->dR;
1195     FunDensProp dp = { 0 };
1196 
1197     assert(grid->ntypso >0);
1198     for(i=blstart; i<bllen; i++) {
1199         real weight = grid->weight[grid->curr_point+i];
1200         dp.rhoa = dp. rhob = 0.5*grid->r.rho[i];
1201         data->energy += selected_func->func(&dp)*weight;
1202         dftpot0_(&drvs, &weight, &dp);
1203         dR[i] = 2*drvs.fR;
1204     }
1205 
1206     distribute_lda_bl(grid, bllen, blstart, blend, tmp, dR, excmat);
1207 }
1208 
1209 static void
kohn_sham_cb_b_gga(DftIntegratorBl * grid,real * RESTRICT tmp,integer bllen,integer blstart,integer blend,struct ks_data * data)1210 kohn_sham_cb_b_gga(DftIntegratorBl* grid, real * RESTRICT tmp,
1211                    integer bllen, integer blstart, integer blend,
1212                    struct ks_data* data)
1213 {
1214     FirstDrv drvs;
1215     integer i;
1216     real * RESTRICT excmat = data->excmat;
1217     real * RESTRICT dR = data->dR;
1218     real * RESTRICT dZ = data->dZ;
1219     FunDensProp dp = { 0 };
1220 
1221     assert(grid->ntypso >0);
1222     for(i=0; i<bllen; i++) {
1223         real weight = grid->weight[grid->curr_point+i];
1224         dp.grada = 0.5*sqrt(grid->g.grad[i][0]*grid->g.grad[i][0]+
1225                             grid->g.grad[i][1]*grid->g.grad[i][1]+
1226                             grid->g.grad[i][2]*grid->g.grad[i][2]);
1227         dp. rhoa = dp. rhob = 0.5*grid->r.rho[i];
1228         dp.gradb  = dp.grada;
1229         dp.gradab = dp.grada*dp.gradb;
1230         data->energy += selected_func->func(&dp)*weight;
1231         dftpot0_(&drvs, &weight, &dp);
1232         dR[i] = drvs.fR;
1233         dZ[i] = drvs.fZ/dp.grada;
1234     }
1235 
1236     distribute_gga_bl(grid, bllen, blstart, blend, tmp, dR, dZ, excmat);
1237 }
1238 
1239 /* dft_kohn_sham:
1240    compute Fock matrix ksm corresponding to given density matrix dmat.
1241    fast version - uses memory bandwidth-efficient algorithm.
1242 */
1243 void
FSYM2(dft_kohn_shamf)1244 FSYM2(dft_kohn_shamf)(real* dmat, real* ksm, real* edfty,
1245 		      real* work, integer *lwork, integer* iprint)
1246 {
1247     integer nbast2, i, j;
1248     struct tms starttm, endtm; clock_t utm;
1249     real electrons;
1250     struct ks_data ds;
1251 
1252     nbast2   = inforb_.nbast*inforb_.nbast;
1253     ds.excmat = calloc(nbast2, sizeof(real));
1254     ds.dR     = dal_malloc(DFT_BLLEN*sizeof(real));
1255     ds.dZ     = dal_malloc(DFT_BLLEN*sizeof(real));
1256 
1257     times(&starttm);
1258     ds.energy = 0.0;
1259 
1260     electrons = dft_integrate_ao_bl(1, dmat, work, lwork, iprint, 0,
1261                                     (DftBlockCallback)
1262                                     (selected_func->is_gga() ?
1263                                     kohn_sham_cb_b_gga : kohn_sham_cb_b_lda),
1264                                     &ds);
1265 
1266     for(i=0; i<inforb_.nbast; i++) {
1267 	integer ioff = i*inforb_.nbast;
1268 	for(j=0; j<i; j++) {
1269 	    integer joff = j*inforb_.nbast;
1270 	    real averag = 0.5*(ds.excmat[i+joff] + ds.excmat[j+ioff]);
1271 	    ds.excmat[i+joff] = ds.excmat[j+ioff] = averag;
1272 	}
1273     }
1274 
1275     *edfty=ds.energy;
1276     daxpy_(&inforb_.n2basx, &ONER, ds.excmat, &ONEI, ksm, &ONEI);
1277     if(KOHNSH_DEBUG) {
1278         fort_print("kohn sham matrix");
1279         outmat_(ds.excmat,&ONEI,&inforb_.nbast,&ONEI,&inforb_.nbast,
1280                 &inforb_.nbast, &inforb_.nbast);
1281     }
1282 
1283     free(ds.excmat);
1284     free(ds.dR);
1285     free(ds.dZ);
1286     if (*iprint>0) {
1287       times(&endtm);
1288       utm = endtm.tms_utime-starttm.tms_utime;
1289       fort_print("      Electrons: %11.7f %7.1g: Energy %f KS/B time: %9.1f s",
1290                  electrons, (electrons-2.0*inforb_.nrhft)/(2.0*inforb_.nrhft),
1291                  ds.energy, utm/(double)sysconf(_SC_CLK_TCK));
1292     }
1293 }
1294 
1295 /* ------------------------------------------------------------------- */
1296 /* ------ blocked DFT LINEAR RESPONSE CONTRIBUTION EVALUATOR --------- */
1297 /* ------------------------------------------------------------------- */
1298 
1299 typedef struct {
1300     real* dmat, *kappa, *res;
1301     real* dtgao;
1302     real* vt; /* dimensioned [bllen] for LDA, [bllen][4] for GGA */
1303     integer trplet, ksymop;
1304     integer vecs_in_batch;
1305 } LinRespBlData;
1306 
1307 static void
lin_resp_cb_b_lda(DftIntegratorBl * grid,real * RESTRICT tmp,integer bllen,integer blstart,integer blend,LinRespBlData * data)1308 lin_resp_cb_b_lda(DftIntegratorBl* grid, real * RESTRICT tmp,
1309                   integer bllen, integer blstart, integer blend,
1310                   LinRespBlData* data)
1311 {
1312     real * RESTRICT aos = grid->atv;
1313     real * RESTRICT excmat = data->res;
1314     real (* RESTRICT vt) = data->vt; /* [bllen][4] */
1315     integer ibl, i, jbl, j, k, isym, ivec;
1316     FunDensProp dp = { 0 };
1317     integer blen = bllen;
1318 
1319     for(ivec=0; ivec<data->vecs_in_batch; ivec++) {
1320         /* compute vector of transformed densities vt */
1321         FSYM2(getexp_blocked_lda)(&data->ksymop, data->kappa + ivec*inforb_.n2basx,
1322                                   grid->atv, grid->bas_bl_cnt, grid->basblocks,
1323                                   &grid->shl_bl_cnt, tmp, &blen, vt);
1324 
1325         for(i=blstart; i<blend; i++) {
1326             SecondDrv vxc;
1327             real weight = grid->weight[grid->curr_point+i];
1328             dp.rhoa = dp.rhob = 0.5*grid->r.rho[i];
1329             dftpot1_(&vxc, &weight, &dp, &data->trplet);
1330             vt[i] = vxc.fRR*vt[i]*2;
1331         }
1332 
1333         for(isym=0; isym<grid->nsym; isym++) {
1334             integer (*RESTRICT iblocks)[2] = BASBLOCK(grid,isym);
1335             integer ibl_cnt = grid->bas_bl_cnt[isym];
1336 
1337             for(ibl=0; ibl<ibl_cnt; ibl++)
1338                 for(i=iblocks[ibl][0]-1; i<iblocks[ibl][1]; i++) {
1339                     integer ioff = i*bllen;
1340                     for(k=blstart; k<blend; k++)
1341                     tmp[k+ioff] = aos[k+ioff]*vt[k];
1342                 }
1343 
1344             for(ibl=0; ibl<ibl_cnt; ibl++) {
1345                 for(i=iblocks[ibl][0]-1; i<iblocks[ibl][1]; i++) {
1346                     integer ioff = i*inforb_.nbast + ivec*inforb_.n2basx;
1347                     integer jsym = inforb_.muld2h[data->ksymop-1][isym]-1;
1348                     integer (*RESTRICT jblocks)[2] = BASBLOCK(grid,jsym);
1349                     integer jbl_cnt = grid->bas_bl_cnt[jsym];
1350                     real *RESTRICT tmpi = &tmp[i*bllen];
1351                     if (isym<jsym) continue;
1352                     for(jbl=0; jbl<jbl_cnt; jbl++) {
1353                         integer jtop = min(jblocks[jbl][1],i);
1354                         for(j=jblocks[jbl][0]-1; j<jtop; j++) {
1355                             for(k=blstart; k<blend; k++)
1356                                 excmat[j+ioff] +=
1357                                     aos[k+j*bllen]*tmpi[k];
1358                         }
1359                     }
1360                     for(k=blstart; k<blend; k++)
1361                         excmat[i+ioff] += aos[k+i*bllen]*tmpi[k]*0.5;
1362                 }
1363             }
1364         }
1365     }
1366 }
1367 
1368 static void
lin_resp_cb_b_gga(DftIntegratorBl * grid,real * RESTRICT tmp,integer bllen,integer blstart,integer blend,LinRespBlData * data)1369 lin_resp_cb_b_gga(DftIntegratorBl* grid, real * RESTRICT tmp,
1370                   integer bllen, integer blstart, integer blend,
1371                   LinRespBlData* data)
1372 {
1373     integer ibl, i, jbl, j, k, isym, ivec;
1374     real (* RESTRICT vt3)[4] = (real(*)[4])data->vt; /* [bllen][4] */
1375     real * RESTRICT aos = grid->atv;
1376     real * RESTRICT aox = grid->atv+bllen*inforb_.nbast;
1377     real * RESTRICT aoy = grid->atv+bllen*inforb_.nbast*2;
1378     real * RESTRICT aoz = grid->atv+bllen*inforb_.nbast*3;
1379     real * RESTRICT excmat = data->res;
1380     FunDensProp dp = { 0 };
1381     integer blen = bllen;
1382 
1383     for(ivec=0; ivec<data->vecs_in_batch; ivec++) {
1384         /* compute vector of transformed densities and dens. gradients vt3 */
1385         FSYM2(getexp_blocked_gga)(&data->ksymop,data->kappa + ivec*inforb_.n2basx,
1386                                   grid->atv, grid->bas_bl_cnt,
1387                                   grid->basblocks, &grid->shl_bl_cnt, tmp,
1388 				  &blen, vt3);
1389         for(i=blstart; i<blend; i++) {
1390             SecondDrv vxc;
1391             real facr, facg;
1392             real weight = grid->weight[grid->curr_point+i];
1393             real ngrad  = sqrt(grid->g.grad[i][0]*grid->g.grad[i][0]+
1394                                grid->g.grad[i][1]*grid->g.grad[i][1]+
1395                                grid->g.grad[i][2]*grid->g.grad[i][2]);
1396             real brg, brz, b0 = vt3[i][0];
1397             if(ngrad<1e-15|| grid->r.rho[i]<1e-15) {
1398                 vt3[i][0] = vt3[i][1] = vt3[i][2] = vt3[i][3] = 0;
1399                 continue;
1400             }
1401             brg = (vt3[i][1]*grid->g.grad[i][0] +
1402                    vt3[i][2]*grid->g.grad[i][1] +
1403                    vt3[i][3]*grid->g.grad[i][2]);
1404             brz = brg/ngrad;
1405             dp. rhoa = dp. rhob = 0.5*grid->r.rho[i];
1406             dp.grada = dp.gradb = 0.5*ngrad;
1407             dp.gradab = dp.grada*dp.gradb;
1408             dftpot1_(&vxc, &weight, &dp, &data->trplet);
1409             facr = vxc.fRZ*b0 + (vxc.fZZ-vxc.fZ/ngrad)*brz + vxc.fZG*brg;
1410             facr = facr/ngrad + (vxc.fRG*b0+vxc.fZG*brz +vxc.fGG*brg);
1411             facg = vxc.fZ/ngrad + vxc.fG;
1412             vt3[i][0] = vxc.fRR*b0 + vxc.fRZ*brz+ vxc.fRG*brg;
1413             vt3[i][1] = (grid->g.grad[i][0]*facr + facg*vt3[i][1])*2;
1414             vt3[i][2] = (grid->g.grad[i][1]*facr + facg*vt3[i][2])*2;
1415             vt3[i][3] = (grid->g.grad[i][2]*facr + facg*vt3[i][3])*2;
1416         }
1417 
1418         for(isym=0; isym<grid->nsym; isym++) {
1419             integer (*RESTRICT iblocks)[2] = BASBLOCK(grid,isym);
1420             integer ibl_cnt = grid->bas_bl_cnt[isym];
1421 
1422             for(ibl=0; ibl<ibl_cnt; ibl++) {
1423                 for(i=iblocks[ibl][0]-1; i<iblocks[ibl][1]; i++) {
1424                     real *RESTRICT g0i = &aos[i*bllen];
1425                     real *RESTRICT gxi = &aox[i*bllen];
1426                     real *RESTRICT gyi = &aoy[i*bllen];
1427                     real *RESTRICT gzi = &aoz[i*bllen];
1428                     integer ioff = i*inforb_.nbast + ivec*inforb_.n2basx;
1429                     integer jsym = inforb_.muld2h[data->ksymop-1][isym]-1;
1430                     integer (*RESTRICT jblocks)[2] = BASBLOCK(grid,jsym);
1431                     integer jbl_cnt = grid->bas_bl_cnt[jsym];
1432                     for(k=blstart; k<blend; k++)
1433                         tmp[k] = (vt3[k][0]*g0i[k] +
1434                                   vt3[k][1]*gxi[k] +
1435                                   vt3[k][2]*gyi[k] +
1436                                   vt3[k][3]*gzi[k]);
1437                     for(jbl=0; jbl<jbl_cnt; jbl++) {
1438                         integer jtop = jblocks[jbl][1];
1439                         for(j=jblocks[jbl][0]-1; j<jtop; j++) {
1440                             real *RESTRICT g0j = &aos[j*bllen];
1441                             integer bllen = blend - blstart;
1442                             real s = ddot_(&bllen, g0j+blstart, &ONEI, tmp+blstart, &ONEI);
1443                             excmat[j+ioff] += s;
1444                         }
1445                     }
1446                 }
1447             }
1448         }
1449     }
1450 }
1451 
1452 /* dft_lin_respf_:
1453    Main Linear Response evaluator.
1454    FMAT(NORBT,NORBT,NOSIM) - result added to FMAT. Must not be referenced on slaves.
1455    cmo -const
1456    zymat(NORBT,NORBT,NOSIM) - const response vector.
1457    trplet - triplet excitation? (bool)
1458    NOSIM - number of simultaneously transformed response vectors.
1459 */
1460 void
FSYM2(dft_lin_respf)1461 FSYM2(dft_lin_respf)(integer *nosim, real* fmat, real *cmo, real *zymat,
1462 		     integer *trplet, integer *ksymop,
1463 		     real* work, integer* lwork, integer* iprint)
1464 {
1465     /* MAX_VEC determines the number of simultaneusly transformed
1466        vectors.  Set it to a small mumber (eg. 3) - the only operation
1467        that is saved is evaluation of orbitals and density and density
1468        gradients which is usually small compared to the time spent in
1469        actual vector transformation. Larger values will only increate
1470        memory utilization without positive impact on performance.
1471        FIXME: consider using work for this purpose. */
1472     static const integer MAX_VEC = 5;
1473     static const real DP5R = 0.5;
1474     struct tms starttm, endtm; clock_t utm;
1475     real electrons = 0;
1476     LinRespBlData lr_data; /* linear response data */
1477     real dummy;
1478     integer i, j, ivec, jvec;
1479     integer max_vecs;
1480 
1481     /* WARNING: NO work MAY BE done before syncing slaves! */
1482     dft_wake_slaves((DFTPropEvalMaster)FSYM2(dft_lin_respf)); /* NO-OP in serial */
1483     dft_lin_resp_sync_slaves(cmo,nosim,&zymat,trplet,ksymop,
1484                              &work, *lwork);              /* NO-OP in serial */
1485 
1486     times(&starttm);
1487     max_vecs = *nosim>MAX_VEC ? MAX_VEC : *nosim;
1488     lr_data.dmat  = dal_malloc(inforb_.n2basx*sizeof(real));
1489     lr_data.res   = dal_malloc(inforb_.n2basx*sizeof(real)*max_vecs);
1490     lr_data.kappa = dal_malloc(inforb_.n2basx*sizeof(real)*max_vecs);
1491     lr_data.vt    = dal_malloc(DFT_BLLEN*4   *sizeof(real));
1492     lr_data.dtgao = dal_malloc(inforb_.nbast *sizeof(real));
1493     lr_data.trplet= *trplet;
1494     lr_data.ksymop= *ksymop;
1495     FSYM2(dft_get_ao_dens_mat)(cmo, lr_data.dmat, work, lwork);
1496 
1497     for(ivec=0; ivec<*nosim; ivec+=max_vecs) {
1498         integer sz;
1499         lr_data.vecs_in_batch = ivec + max_vecs > *nosim ? *nosim - ivec : max_vecs;
1500         sz = lr_data.vecs_in_batch * inforb_.n2basx;
1501         FSYM(dzero)(lr_data.kappa, &sz);
1502         for(jvec=0; jvec<lr_data.vecs_in_batch; jvec++){
1503             FSYM(deq27)(cmo,zymat+(ivec+jvec)*inforb_.n2orbx,&dummy,
1504                         lr_data.kappa+jvec*inforb_.n2basx, &dummy,work,lwork);
1505             dscal_(&inforb_.n2basx,&DP5R,lr_data.kappa+jvec*inforb_.n2basx,&ONEI);
1506         }
1507         FSYM(dzero)(lr_data.res, &sz);
1508         electrons = dft_integrate_ao_bl(1, lr_data.dmat, work, lwork, iprint, 0,
1509                                         (DftBlockCallback)
1510                                         (selected_func->is_gga() ?
1511                                          lin_resp_cb_b_gga : lin_resp_cb_b_lda),
1512                                         &lr_data);
1513 #ifdef VAR_MPI
1514         dft_lin_resp_collect_info(lr_data.vecs_in_batch,lr_data.res, work, *lwork);
1515 	if(fmat == NULL)  /* The transformations below are done only by master. */
1516 	    continue;
1517 #endif
1518         if(DFTLR_DEBUG) {
1519             fort_print("AO Fock matrix contribution in dft_lin_respf");
1520             outmat_(lr_data.res,&ONEI,&inforb_.nbast,&ONEI,&inforb_.nbast,
1521                     &inforb_.nbast, &inforb_.nbast);
1522         }
1523         for(jvec=0; jvec<lr_data.vecs_in_batch; jvec++){
1524             for(i=0; i<inforb_.nbast; i++) {
1525                 integer ioff = i*inforb_.nbast + jvec*inforb_.n2basx;
1526                 for(j=0; j<i; j++) {
1527                     integer joff = j*inforb_.nbast + jvec*inforb_.n2basx;
1528                     real averag = 0.5*(lr_data.res[i+joff] +
1529                                        lr_data.res[j+ioff]);
1530                     lr_data.res[i+joff] = lr_data.res[j+ioff] = averag;
1531                 }
1532             }
1533             /* transform to MO Fock matrix contribution  */
1534             FSYM(lrao2mo)(cmo, &lr_data.ksymop, lr_data.res+jvec*inforb_.n2basx,
1535                           fmat+(ivec+jvec)*inforb_.n2orbx, work, lwork);
1536         }
1537         if(DFTLR_DEBUG) {
1538             fort_print("MO Fock matrix contribution in dft_lin_resp");
1539             outmat_(fmat,&ONEI,&inforb_.norbt,&ONEI,&inforb_.norbt,
1540                     &inforb_.norbt, &inforb_.norbt);
1541         }
1542     }
1543 
1544     free(lr_data.dmat);
1545     free(lr_data.res);
1546     free(lr_data.kappa);
1547     free(lr_data.vt);
1548     free(lr_data.dtgao);
1549     if (*iprint>0) {
1550       times(&endtm);
1551       utm = endtm.tms_utime-starttm.tms_utime;
1552       fort_print("      Electrons: %f(%9.3g): LR-DFT*%d evaluation time: %9.1f s",
1553                  electrons, (double)(electrons-(integer)(electrons+0.5)), *nosim,
1554                  utm/(double)sysconf(_SC_CLK_TCK));
1555     }
1556 }
1557 
1558 void
dft_lin_respf_slave(real * work,integer * lwork,integer * iprint)1559 dft_lin_respf_slave(real* work, integer* lwork, integer* iprint)
1560 {
1561     real *cmo  = malloc(inforb_.norbt*inforb_.nbast*sizeof(real)); /* IN  */
1562     integer trplet;                     /* IN: will be synced from master */
1563     integer ksymop, nosim;
1564     FSYM2(dft_lin_respf)(&nosim, NULL, cmo, NULL,
1565                          &trplet, &ksymop, work, lwork, iprint);
1566     free(cmo);
1567 }
1568 
1569 
1570 /*====================================================================*/
1571 /*                     BLOCKED OPEN-SHELL CODE                        */
1572 /*                   Z. Rinkevicius, June 2008                        */
1573 /*====================================================================*/
1574 
1575 /*                    Kohn-Sham evaluator                             */
1576 
1577 struct ks_data_ab {
1578   real *ksma, *ksmb;
1579   real energy;
1580   real *dRa;
1581   real *dRb;
1582   real (*dZa);
1583   real (*dZb);
1584   real (*dZab);
1585   real *tmpa, *tmpb;
1586 };
1587 
1588 
1589 #if defined(VAR_MPI)
1590 void
dft_kohn_shamab_b_slave(real * work,integer * lwork,integer * iprint)1591 dft_kohn_shamab_b_slave(real* work, integer* lwork, integer* iprint)
1592 {
1593   real* dmat = malloc(2*inforb_.n2basx*sizeof(real));
1594   FSYM2(dft_kohn_shamab_b)(dmat, NULL, NULL, work, lwork, iprint);
1595   free(dmat);
1596 }
1597 
1598 extern void FSYM(dftintbcast)(void);
1599 
1600 void
dft_kohn_shamab_b_sync_slaves(real * dmat)1601 dft_kohn_shamab_b_sync_slaves(real* dmat)
1602 {
1603   FSYM(dftintbcast)();
1604   MPI_Bcast(dmat,  2*inforb_.n2basx,MPI_DOUBLE,
1605             MASTER_NO, MPI_COMM_WORLD);
1606 }
1607 
1608 static void
dft_kohn_shamab_b_collect(real * energy,real * ksm,real * work,integer lwork)1609 dft_kohn_shamab_b_collect(real *energy, real *ksm, real* work, integer lwork)
1610 {
1611   real tmp = *energy;
1612   int sz = 0;
1613   MPI_Comm_size(MPI_COMM_WORLD, &sz);
1614   if(sz <=1) return;
1615   sz = 2*inforb_.n2basx;
1616   integer sz_ftn = sz;
1617   CHECK_WRKMEM(sz, lwork);
1618   dcopy_(&sz_ftn, ksm,&ONEI, work, &ONEI);
1619   MPI_Reduce(work, ksm, sz, MPI_DOUBLE, MPI_SUM,
1620              MASTER_NO, MPI_COMM_WORLD);
1621   MPI_Reduce(&tmp, energy,  1, MPI_DOUBLE, MPI_SUM,
1622              MASTER_NO, MPI_COMM_WORLD);
1623 }
1624 #endif
1625 
1626 static void
distribute_lda_ab_bl(DftIntegratorBl * grid,integer bllen,integer blstart,integer blend,real * RESTRICT tmpa,real * RESTRICT tmpb,real * RESTRICT dRa,real * RESTRICT dRb,real * RESTRICT exc_a,real * RESTRICT exc_b)1627 distribute_lda_ab_bl(DftIntegratorBl *grid, integer bllen, integer blstart, integer blend,
1628                      real * RESTRICT tmpa,real * RESTRICT tmpb,
1629                      real *RESTRICT dRa, real *RESTRICT dRb,
1630                      real * RESTRICT exc_a, real * RESTRICT exc_b)
1631 {
1632   integer isym, jbl, j, ibl, i, k;
1633   real * RESTRICT aos = grid->atv;
1634 
1635   for(isym=0; isym<grid->nsym; isym++) {
1636     integer (*RESTRICT blocks)[2] = BASBLOCK(grid,isym);
1637     integer bl_cnt = grid->bas_bl_cnt[isym];
1638 
1639     for(jbl=0; jbl<bl_cnt; jbl++)
1640       for(j=blocks[jbl][0]-1; j<blocks[jbl][1]; j++) {
1641         integer joff = j*bllen;
1642         real *RESTRICT e_jcola = exc_a+ j*inforb_.nbast;
1643         real *RESTRICT e_jcolb = exc_b+ j*inforb_.nbast;
1644         for(k=blstart; k<blend; k++) {
1645           tmpa[k] = aos[k+joff]*dRa[k];
1646           tmpb[k] = aos[k+joff]*dRb[k];
1647         }
1648         for(ibl=0; ibl<bl_cnt; ibl++) {
1649                     integer top = blocks[ibl][1] < j
1650                       ? blocks[ibl][1] : j;
1651                     for(i=blocks[ibl][0]-1; i<top; i++) {
1652                       real *RESTRICT aosi = aos + i*bllen; /* ith orbital */
1653                       for(k=blstart; k<blend; k++) {
1654                         e_jcola[i] += aosi[k]*tmpa[k];
1655                         e_jcolb[i] += aosi[k]*tmpb[k];
1656                       }
1657                     }
1658         }
1659         for(k=blstart; k<blend; k++) {
1660           e_jcola[j] += 0.5*aos[k+j*bllen]*tmpa[k];
1661           e_jcolb[j] += 0.5*aos[k+j*bllen]*tmpb[k];
1662         }
1663       }
1664   }
1665 }
1666 
1667 static void
distribute_gga_ab_bl(DftIntegratorBl * grid,integer bllen,integer blstart,integer blend,real * RESTRICT tmpa,real * RESTRICT tmpb,real * RESTRICT dRa,real * RESTRICT dRb,real * RESTRICT dZa,real * RESTRICT dZb,real * RESTRICT dZab,real * RESTRICT exc_a,real * RESTRICT exc_b)1668 distribute_gga_ab_bl(DftIntegratorBl *grid, integer bllen, integer blstart, integer blend,
1669                      real * RESTRICT tmpa, real * RESTRICT tmpb,
1670                      real *RESTRICT dRa, real *RESTRICT dRb,
1671                      real *RESTRICT dZa, real *RESTRICT dZb,
1672                      real *RESTRICT dZab,
1673                      real * RESTRICT exc_a, real * RESTRICT exc_b)
1674 {
1675   integer isym, jbl, j, ibl, i, k;
1676   real * RESTRICT aox = grid->atv+bllen*inforb_.nbast;
1677   real * RESTRICT aoy = grid->atv+bllen*inforb_.nbast*2;
1678   real * RESTRICT aoz = grid->atv+bllen*inforb_.nbast*3;
1679   real * RESTRICT aos = grid->atv;
1680 
1681   for(isym=0; isym<grid->nsym; isym++) {
1682     integer (*RESTRICT blocks)[2] = BASBLOCK(grid,isym);
1683     integer nblocks = grid->bas_bl_cnt[isym];
1684     for(jbl=0; jbl<nblocks; jbl++)
1685       for(j=blocks[jbl][0]-1; j<blocks[jbl][1]; j++) {
1686         integer joff = j*bllen;
1687         real *RESTRICT e_jcola = exc_a + j*inforb_.nbast;
1688         real *RESTRICT e_jcolb = exc_b + j*inforb_.nbast;
1689         for(k=blstart; k<blend; k++) {
1690           tmpa[k] =
1691             dRa[k]* aos[k+joff]+
1692             dZa[k]*(aox[k+joff]*grid->g.rad.a[k][0]+
1693                     aoy[k+joff]*grid->g.rad.a[k][1]+
1694                     aoz[k+joff]*grid->g.rad.a[k][2])+
1695             dZab[k]*(aox[k+joff]*grid->g.rad.b[k][0]+
1696                      aoy[k+joff]*grid->g.rad.b[k][1]+
1697                      aoz[k+joff]*grid->g.rad.b[k][2]);
1698           tmpb[k] =
1699             dRb[k]* aos[k+joff]+
1700             dZb[k]*(aox[k+joff]*grid->g.rad.b[k][0]+
1701                     aoy[k+joff]*grid->g.rad.b[k][1]+
1702                     aoz[k+joff]*grid->g.rad.b[k][2])+
1703             dZab[k]*(aox[k+joff]*grid->g.rad.a[k][0]+
1704                      aoy[k+joff]*grid->g.rad.a[k][1]+
1705                      aoz[k+joff]*grid->g.rad.a[k][2]);
1706         }
1707         for(ibl=0; ibl<nblocks; ibl++) {
1708           for(i=blocks[ibl][0]-1; i<blocks[ibl][1]; i++) {
1709             for(k=blstart; k<blend; k++) {
1710               e_jcola[i] += aos[k+i*bllen]*tmpa[k];
1711               e_jcolb[i] += aos[k+i*bllen]*tmpb[k];
1712             }
1713           }
1714         }
1715       }
1716   }
1717 }
1718 
1719 static void
kohn_shamab_cb_b_lda(DftIntegratorBl * grid,real * RESTRICT tmp,integer bllen,integer blstart,integer blend,struct ks_data_ab * data)1720 kohn_shamab_cb_b_lda(DftIntegratorBl *grid, real * RESTRICT tmp,
1721                      integer bllen, integer blstart, integer blend,
1722                      struct ks_data_ab* data)
1723 {
1724 
1725   FunFirstFuncDrv drvs;
1726   integer i;
1727   real * RESTRICT exc_a  = data->ksma;
1728   real * RESTRICT exc_b  = data->ksmb;
1729   real * RESTRICT dRa    = data->dRa;
1730   real * RESTRICT dRb    = data->dRb;
1731   FunDensProp dp = { 0 };
1732 
1733   assert(grid->ntypso >0);
1734   for(i=blstart; i<blend; i++) {
1735     real weight = grid->weight[grid->curr_point+i];
1736     dp.rhoa = grid->r.ho.a[i];
1737     dp.rhob = grid->r.ho.b[i];
1738     data->energy += selected_func->func(&dp)*weight;
1739     drv1_clear(&drvs);
1740     selected_func->first(&drvs, weight, &dp);
1741     dRa[i] = 2.0*drvs.df1000;
1742     dRb[i] = 2.0*drvs.df0100;
1743   }
1744   distribute_lda_ab_bl(grid, bllen, blstart, blend, data->tmpa, data->tmpb,
1745                        dRa, dRb,  exc_a, exc_b);
1746 }
1747 
1748 static void
kohn_shamab_cb_b_gga(DftIntegratorBl * grid,real * RESTRICT tmp,integer bllen,integer blstart,integer blend,struct ks_data_ab * data)1749 kohn_shamab_cb_b_gga(DftIntegratorBl* grid, real * RESTRICT tmp,
1750                      integer bllen, integer blstart, integer blend,
1751                      struct ks_data_ab* data)
1752 {
1753   FunFirstFuncDrv drvs;
1754   integer i;
1755   real * RESTRICT exc_a   = data->ksma;
1756   real * RESTRICT exc_b   = data->ksmb;
1757   real * RESTRICT dRa     = data->dRa;
1758   real * RESTRICT dRb     = data->dRb;
1759   real * RESTRICT dZa     = data->dZa;
1760   real * RESTRICT dZb     = data->dZb;
1761   real * RESTRICT dZab    = data->dZab;
1762   FunDensProp dp = { 0 };
1763 
1764   assert(grid->ntypso >0);
1765   for(i=blstart; i<blend; i++) {
1766     real weight = grid->weight[grid->curr_point+i];
1767     dp.rhoa = grid->r.ho.a[i];
1768     dp.rhob = grid->r.ho.b[i];
1769     dp.grada = sqrt(grid->g.rad.a [i][0]*grid->g.rad.a [i][0]+
1770                     grid->g.rad.a [i][1]*grid->g.rad.a [i][1]+
1771                     grid->g.rad.a [i][2]*grid->g.rad.a [i][2]);
1772     dp.grada = fabs(dp.grada)>1e-40 ? dp.grada : 1e-40;
1773     dp.gradb = sqrt(grid->g.rad.b [i][0]*grid->g.rad.b [i][0]+
1774                     grid->g.rad.b [i][1]*grid->g.rad.b [i][1]+
1775                     grid->g.rad.b [i][2]*grid->g.rad.b [i][2]);
1776     dp.gradb  = fabs(dp.gradb)>1e-40 ? dp.gradb : 1e-40;
1777     dp.gradab = grid->g.rad.a [i][0]*grid->g.rad.b [i][0]+
1778                 grid->g.rad.a [i][1]*grid->g.rad.b [i][1]+
1779       grid->g.rad.a [i][2]*grid->g.rad.b [i][2];
1780     data->energy += selected_func->func(&dp)*weight;
1781     drv1_clear(&drvs);
1782     selected_func->first(&drvs, weight, &dp);
1783     dRa[i]    = drvs.df1000;
1784     dRb[i]    = drvs.df0100;
1785     dZa[i]    = 2.0*drvs.df0010/dp.grada;
1786     dZb[i]    = 2.0*drvs.df0001/dp.gradb;
1787     dZab[i]   = 2.0*drvs.df00001;
1788   }
1789   distribute_gga_ab_bl(grid, bllen, blstart, blend, data->tmpa, data->tmpb,
1790                        dRa, dRb, dZa, dZb, dZab, exc_a, exc_b);
1791 }
1792 
1793 void
FSYM2(dft_kohn_shamab_b)1794 FSYM2(dft_kohn_shamab_b)(real* dmat, real* ksm, real *edfty,
1795                          real* work, integer *lwork, integer* iprint)
1796 {
1797   integer i, j;
1798   integer sz;
1799   struct ks_data_ab result; /* res like result */
1800   struct tms starttm, endtm; clock_t utm;
1801   real electrons, exp_el = 2.0*inforb_.nrhft+inforb_.nasht;
1802 
1803   /* parallel code insert */
1804 #if defined(VAR_MPI)
1805   dft_wake_slaves((DFTPropEvalMaster)FSYM2(dft_kohn_shamab_b));
1806   dft_kohn_shamab_b_sync_slaves(dmat);
1807 #endif
1808   /* memory allocation */
1809   result.ksma = calloc(2*inforb_.n2basx, sizeof(real));
1810   result.ksmb = result.ksma + inforb_.n2basx;
1811   result.dRa     = dal_malloc(DFT_BLLEN*sizeof(real));
1812   result.dRb     = dal_malloc(DFT_BLLEN*sizeof(real));
1813   result.dZa     = dal_malloc(DFT_BLLEN*sizeof(real));
1814   result.dZb     = dal_malloc(DFT_BLLEN*sizeof(real));
1815   result.dZab    = dal_malloc(DFT_BLLEN*sizeof(real));
1816   result.tmpa    = dal_malloc(DFT_BLLEN*sizeof(real));
1817   result.tmpb    = dal_malloc(DFT_BLLEN*sizeof(real));
1818   result.energy = 0.0;
1819   /* start timer*/
1820   times(&starttm);
1821   /* integeration */
1822   electrons = dft_integrate_ao_bl(2, dmat, work, lwork, iprint, 0,
1823                                   (DftBlockCallback)(selected_func->is_gga() ?
1824 						     kohn_shamab_cb_b_gga:kohn_shamab_cb_b_lda),
1825                                   &result);
1826   /* parallel code insert */
1827 #if defined(VAR_MPI)
1828   /* collect data */
1829   dft_kohn_shamab_b_collect(&result.energy,result.ksma, work, *lwork);
1830   /* free memory and leave if slave*/
1831   if (edfty == NULL) {
1832     free(result.ksma);
1833     free(result.dRa);
1834     free(result.dRb);
1835     free(result.dZa);
1836     free(result.dZb);
1837     free(result.dZab);
1838     free(result.tmpa);
1839     free(result.tmpb);
1840     return;
1841   };
1842 #endif
1843   /* average Kohn-Sham matrices */
1844   for(i=0; i<inforb_.nbast; i++) {
1845     integer ioff = i*inforb_.nbast;
1846     for(j=0; j<i; j++) {
1847       integer joff = j*inforb_.nbast;
1848       real averag = 0.5*(result.ksma[i+joff] + result.ksma[j+ioff]);
1849       result.ksma[i+joff] = result.ksma[j+ioff] = averag;
1850       averag = 0.5*(result.ksmb[i+joff] + result.ksmb[j+ioff]);
1851       result.ksmb[i+joff] = result.ksmb[j+ioff] = averag;
1852     }
1853   }
1854   /* results */
1855   *edfty=result.energy;
1856   sz = 2*inforb_.n2basx;
1857   FSYM(daxpy)(&sz, &ONER, result.ksma, &ONEI, ksm, &ONEI);
1858   /* free memory */
1859   free(result.ksma);
1860   free(result.dRa);
1861   free(result.dRb);
1862   free(result.dZa);
1863   free(result.dZb);
1864   free(result.dZab);
1865   free(result.tmpa);
1866   free(result.tmpb);
1867   /* timings & print out*/
1868   if (*iprint>0) {
1869     times(&endtm);
1870     utm = endtm.tms_utime-starttm.tms_utime;
1871     fort_print("      Electrons: %11.7f %9.1g: Energy %f KS-AB/B time: %9.1f s",
1872                electrons, (electrons-exp_el)/exp_el,
1873                result.energy, utm/(double)sysconf(_SC_CLK_TCK));
1874   }
1875 }
1876 
1877 /* Linear response contribution evaluator */
1878 
1879 typedef struct {
1880   real* dmata, *dmatb;
1881   real* kappa_a, *kappa_b;
1882   real* res_a, *res_b;
1883   real* dtgaoa, *dtgaob;
1884   real* rhowa, *rhowb;
1885   real* vta, *vtb;
1886   real* tmpa, *tmpb;
1887   integer trplet, ksymop, vecs_in_batch;
1888 } LinResp_ab_BlData;
1889 
1890 
1891 #if defined(VAR_MPI)
1892 void
dft_lin_respab_b_slave(real * work,integer * lwork,integer * iprint)1893 dft_lin_respab_b_slave(real* work, integer* lwork, integer* iprint)
1894 {
1895   real *cmo  = malloc(inforb_.norbt*inforb_.nbast*sizeof(real));
1896   integer trplet;
1897   integer ksymop, nosim;
1898   FSYM2(dft_lin_respab_b)(&nosim, NULL, NULL, cmo, NULL,
1899 			  &trplet, &ksymop, work, lwork, iprint);
1900   free(cmo);
1901 }
1902 
1903 static  void
dft_lin_respab_b_collect(integer nvec,real * fmata,real * fmatb,real * work,integer lwork)1904 dft_lin_respab_b_collect(integer nvec, real* fmata, real* fmatb,  real *work, integer lwork)
1905 {
1906   int sz = 0;
1907   MPI_Comm_size(MPI_COMM_WORLD, &sz);
1908   if(sz<=1) return;
1909 
1910   sz = nvec*inforb_.n2basx;
1911   integer sz_ftn =sz;
1912   if(sz>lwork) {
1913     CHECK_WRKMEM(inforb_.n2basx,lwork);
1914     for(sz=0; sz<nvec; sz++) {
1915       dcopy_(&inforb_.n2basx, fmata + sz*inforb_.n2basx,
1916              &ONEI, work, &ONEI);
1917       MPI_Reduce(work, fmata+sz*inforb_.n2basx, inforb_.n2basx,
1918                  MPI_DOUBLE, MPI_SUM, MASTER_NO, MPI_COMM_WORLD);
1919     }
1920     for(sz=0; sz<nvec; sz++) {
1921       dcopy_(&inforb_.n2basx, fmatb + sz*inforb_.n2basx,
1922              &ONEI, work, &ONEI);
1923       MPI_Reduce(work, fmatb+sz*inforb_.n2basx, inforb_.n2basx,
1924                  MPI_DOUBLE, MPI_SUM, MASTER_NO, MPI_COMM_WORLD);
1925     }
1926   } else {
1927     CHECK_WRKMEM(sz, lwork);
1928     dcopy_(&sz_ftn, fmata, &ONEI, work, &ONEI);
1929     MPI_Reduce(work, fmata, sz, MPI_DOUBLE, MPI_SUM,
1930                MASTER_NO, MPI_COMM_WORLD);
1931     dcopy_(&sz_ftn, fmatb, &ONEI, work, &ONEI);
1932     MPI_Reduce(work, fmatb, sz, MPI_DOUBLE, MPI_SUM,
1933                MASTER_NO, MPI_COMM_WORLD);
1934   }
1935 }
1936 #endif
1937 
1938 static void
lin_resp_cb_b_lda_ab(DftIntegratorBl * grid,real * RESTRICT tmp,integer bllen,integer blstart,integer blend,LinResp_ab_BlData * data)1939 lin_resp_cb_b_lda_ab(DftIntegratorBl* grid, real * RESTRICT tmp,
1940 		     integer bllen, integer blstart, integer blend,
1941 		     LinResp_ab_BlData* data)
1942 {
1943   static const real MONER = -1.0;
1944   real * RESTRICT aos = grid->atv;
1945   real * RESTRICT rhowa = data->rhowa;
1946   real * RESTRICT rhowb = data->rhowb;
1947   real * RESTRICT exc_a = data->res_a;
1948   real * RESTRICT exc_b = data->res_b;
1949   real * RESTRICT vta   = data->vta;
1950   real * RESTRICT vtb   = data->vtb;
1951   real * RESTRICT tmpa  = data->tmpa;
1952   real * RESTRICT tmpb  = data->tmpb;
1953   integer ibl, i, jbl, j, k, isym, ivec;
1954   integer bl_size = DFT_BLLEN;
1955   FunSecondFuncDrv vxc;
1956   FunDensProp dp = { 0 };
1957   integer blen = bllen;
1958 
1959 
1960   for(ivec=0; ivec<data->vecs_in_batch; ivec++) {
1961     /* evaluate transformed densities */
1962     FSYM2(getexp_blocked_lda)(&data->ksymop, data->kappa_a + ivec*inforb_.n2basx,
1963 			      grid->atv, grid->bas_bl_cnt, grid->basblocks,
1964 			      &grid->shl_bl_cnt, tmp, &blen, rhowa);
1965     FSYM2(getexp_blocked_lda)(&data->ksymop, data->kappa_b + ivec*inforb_.n2basx,
1966 			      grid->atv, grid->bas_bl_cnt, grid->basblocks,
1967 			      &grid->shl_bl_cnt, tmp, &blen, rhowb);
1968     /* change sign if triplet response */
1969     if (data->trplet) FSYM(dscal)(&bl_size,&MONER,rhowb,&ONEI);
1970     for(i=blstart; i<blend; i++) {
1971       real weight = grid->weight[grid->curr_point+i];
1972       dp.rhoa = grid->r.ho.a[i];
1973       dp.rhob = grid->r.ho.b[i];
1974       drv2_clear(&vxc);
1975       selected_func->second(&vxc,weight,&dp);
1976       vta[i] = vxc.df2000*rhowa[i]+vxc.df1100*rhowb[i];
1977       vtb[i] = vxc.df0200*rhowb[i]+vxc.df1100*rhowa[i];
1978     }
1979     for(isym=0; isym<grid->nsym; isym++) {
1980       integer (*RESTRICT iblocks)[2] = BASBLOCK(grid,isym);
1981       integer ibl_cnt = grid->bas_bl_cnt[isym];
1982       for(ibl=0; ibl<ibl_cnt; ibl++)
1983 	for(i=iblocks[ibl][0]-1; i<iblocks[ibl][1]; i++) {
1984 	  integer ioff = i*bllen;
1985 	  for(k=blstart; k<blend; k++){
1986 	    tmpa[k+ioff] = aos[k+ioff]*vta[k];
1987 	    tmpb[k+ioff] = aos[k+ioff]*vtb[k];
1988 	  }
1989 	}
1990       for(ibl=0; ibl<ibl_cnt; ibl++) {
1991 	for(i=iblocks[ibl][0]-1; i<iblocks[ibl][1]; i++) {
1992 	  integer ioff = i*inforb_.nbast + ivec*inforb_.n2basx;
1993 	  integer jsym = inforb_.muld2h[data->ksymop-1][isym]-1;
1994 	  integer (*RESTRICT jblocks)[2] = BASBLOCK(grid,jsym);
1995 	  integer jbl_cnt = grid->bas_bl_cnt[jsym];
1996 	  real *RESTRICT tmpai = &tmpa[i*bllen];
1997 	  real *RESTRICT tmpbi = &tmpb[i*bllen];
1998 	  if (isym<jsym) continue;
1999 	  for(jbl=0; jbl<jbl_cnt; jbl++) {
2000 	    integer jtop = min(jblocks[jbl][1],i);
2001 	    for(j=jblocks[jbl][0]-1; j<jtop; j++) {
2002 	      for(k=blstart; k<blend; k++) {
2003 		exc_a[j+ioff] += aos[k+j*bllen]*tmpai[k];
2004 		exc_b[j+ioff] += aos[k+j*bllen]*tmpbi[k];
2005 	      }
2006 	    }
2007 	  }
2008 	  for(k=blstart; k<blend; k++) {
2009 	    exc_a[i+ioff] += aos[k+i*bllen]*tmpai[k]*0.5;
2010 	    exc_b[i+ioff] += aos[k+i*bllen]*tmpbi[k]*0.5;
2011 	  }
2012 	}
2013       }
2014     }
2015   }
2016 }
2017 
2018 static void
lin_resp_cb_b_gga_ab(DftIntegratorBl * grid,real * RESTRICT tmp,integer bllen,integer blstart,integer blend,LinResp_ab_BlData * data)2019 lin_resp_cb_b_gga_ab(DftIntegratorBl* grid, real * RESTRICT tmp,
2020 		     integer bllen, integer blstart, integer blend,
2021 		     LinResp_ab_BlData* data)
2022 {
2023   static const real MONER = -1.0;
2024   integer ibl, i, jbl, j, k, isym, ivec;
2025   integer bl_size = 4*DFT_BLLEN;
2026   real * RESTRICT aos        = grid->atv;
2027   real * RESTRICT aox        = grid->atv+bllen*inforb_.nbast;
2028   real * RESTRICT aoy        = grid->atv+bllen*inforb_.nbast*2;
2029   real * RESTRICT aoz        = grid->atv+bllen*inforb_.nbast*3;
2030   real (* RESTRICT rhowa)[4] = (real(*)[4])data->rhowa;
2031   real (* RESTRICT rhowb)[4] = (real(*)[4])data->rhowb;
2032   real (* RESTRICT vt3a)[4]  = (real(*)[4])data->vta;
2033   real (* RESTRICT vt3b)[4]  = (real(*)[4])data->vtb;
2034   real * RESTRICT exc_a      = data->res_a;
2035   real * RESTRICT exc_b      = data->res_b;
2036   real * RESTRICT tmpa       = data->tmpa;
2037   real * RESTRICT tmpb       = data->tmpb;
2038   FunDensProp dp = { 0 };
2039   FunSecondFuncDrv vxc;
2040   real znva, znvb;
2041   real zetaa, zetab, zetac;
2042   real fac0a, fac0b;
2043   real facra, facrb, facg;
2044   integer blen = bllen;
2045 
2046   for(ivec=0; ivec<data->vecs_in_batch; ivec++) {
2047     /* compute vector of transformed densities and dens. gradients */
2048     FSYM2(getexp_blocked_gga)(&data->ksymop,data->kappa_a + ivec*inforb_.n2basx,
2049 			      grid->atv, grid->bas_bl_cnt,
2050 			      grid->basblocks, &grid->shl_bl_cnt, tmp, &blen, rhowa);
2051     FSYM2(getexp_blocked_gga)(&data->ksymop,data->kappa_b + ivec*inforb_.n2basx,
2052 			      grid->atv, grid->bas_bl_cnt,
2053 			      grid->basblocks, &grid->shl_bl_cnt, tmp, &blen, rhowb);
2054     if (data->trplet) FSYM(dscal)(&bl_size,&MONER,(real*)rhowb,&ONEI);
2055     for(i=blstart; i<blend; i++) {
2056       /* Compute density parameters at grid point */
2057       real weight = grid->weight[grid->curr_point+i];
2058       dp.rhoa = grid->r.ho.a[i];
2059       dp.rhob = grid->r.ho.b[i];
2060       dp.grada = sqrt(grid->g.rad.a [i][0]*grid->g.rad.a [i][0]+
2061                             grid->g.rad.a [i][1]*grid->g.rad.a [i][1]+
2062 		      grid->g.rad.a [i][2]*grid->g.rad.a [i][2]);
2063       dp.gradb = sqrt(grid->g.rad.b [i][0]*grid->g.rad.b [i][0]+
2064                             grid->g.rad.b [i][1]*grid->g.rad.b [i][1]+
2065 		      grid->g.rad.b [i][2]*grid->g.rad.b [i][2]);
2066             dp.gradab = grid->g.rad.a [i][0]*grid->g.rad.b [i][0]+
2067                         grid->g.rad.a [i][1]*grid->g.rad.b [i][1]+
2068 	      grid->g.rad.a [i][2]*grid->g.rad.b [i][2];
2069             if(dp.grada<1e-20 || dp.rhoa<1e-20 || dp.gradb<1e-20 || dp.rhob<1e-20) {
2070 	      vt3a[i][0] = vt3a[i][1] = vt3a[i][2] = vt3a[i][3] = 0;
2071 	      vt3b[i][0] = vt3b[i][1] = vt3b[i][2] = vt3b[i][3] = 0;
2072 	      continue;
2073             }
2074             if(dp.grada<1e-20) dp.grada = 1e-20;
2075             if(dp.gradb<1e-20) dp.gradb = 1e-20;
2076             /* second derivatives calculation */
2077             drv2_clear(&vxc);
2078             selected_func->second(&vxc,weight,&dp);
2079             /* compute zetas */
2080             znva = 1.0/dp.grada;
2081             znvb = 1.0/dp.gradb;
2082             zetaa = rhowa[i][1]*grid->g.rad.a [i][0]+
2083                     rhowa[i][2]*grid->g.rad.a [i][1]+
2084 	            rhowa[i][3]*grid->g.rad.a [i][2];
2085             zetaa = zetaa*znva;
2086             zetab = rhowb[i][1]*grid->g.rad.b [i][0]+
2087                     rhowb[i][2]*grid->g.rad.b [i][1]+
2088 	            rhowb[i][3]*grid->g.rad.b [i][2];
2089             zetab = zetab*znvb;
2090             zetac = rhowa[i][1]*grid->g.rad.b [i][0]+
2091                     rhowa[i][2]*grid->g.rad.b [i][1]+
2092                     rhowa[i][3]*grid->g.rad.b [i][2]+
2093                     rhowb[i][1]*grid->g.rad.a [i][0]+
2094                     rhowb[i][2]*grid->g.rad.a [i][1]+
2095 	            rhowb[i][3]*grid->g.rad.a [i][2];
2096 	    /* rhoa and rhob dependent prefactors */
2097             fac0a = vxc.df2000*rhowa[i][0]+vxc.df1100*rhowb[i][0]+
2098 	            vxc.df1010*zetaa+vxc.df1001*zetab+vxc.df10001*zetac;
2099             fac0b = vxc.df0200*rhowb[i][0]+vxc.df1100*rhowa[i][0]+
2100 	            vxc.df0101*zetab+vxc.df0110*zetaa+vxc.df01001*zetac;
2101             vt3a[i][0] = 0.5*fac0a;
2102             vt3b[i][0] = 0.5*fac0b;
2103             /* grada and gradb dependent prefactors*/
2104             facra = vxc.df1010*rhowa[i][0]+vxc.df0110*rhowb[i][0]+
2105 	            vxc.df0020*zetaa+vxc.df0011*zetab+vxc.df00101*zetac;
2106             facrb = vxc.df0101*rhowb[i][0]+vxc.df1001*rhowa[i][0]+
2107 	            vxc.df0002*zetab+vxc.df0011*zetaa+vxc.df00011*zetac;
2108             facra = facra*znva;
2109             facrb = facrb*znvb;
2110             facg  = vxc.df10001*rhowa[i][0]+vxc.df01001*rhowb[i][0]+
2111 	            vxc.df00101*zetaa+vxc.df00011*zetab+vxc.df00002*zetac;
2112             vt3a[i][1] = grid->g.rad.a[i][0]*facra+
2113                          grid->g.rad.b[i][0]*facg+
2114                          znva*vxc.df0010*rhowa[i][1]-
2115                          znva*znva*zetaa*vxc.df0010*grid->g.rad.a[i][0]+
2116 	                 rhowb[i][1]*vxc.df00001;
2117             vt3b[i][1] = grid->g.rad.b[i][0]*facrb+
2118                          grid->g.rad.a[i][0]*facg+
2119                          znvb*vxc.df0001*rhowb[i][1]-
2120                          znvb*znvb*zetab*vxc.df0001*grid->g.rad.b[i][0]+
2121 	                 rhowa[i][1]*vxc.df00001;
2122             vt3a[i][2] = grid->g.rad.a[i][1]*facra+
2123                          grid->g.rad.b[i][1]*facg+
2124                          znva*vxc.df0010*rhowa[i][2]-
2125                          znva*znva*zetaa*vxc.df0010*grid->g.rad.a[i][1]+
2126 	                 rhowb[i][2]*vxc.df00001;
2127             vt3b[i][2] = grid->g.rad.b[i][1]*facrb+
2128                          grid->g.rad.a[i][1]*facg+
2129                          znvb*vxc.df0001*rhowb[i][2]-
2130                          znvb*znvb*zetab*vxc.df0001*grid->g.rad.b[i][1]+
2131 	                 rhowa[i][2]*vxc.df00001;
2132             vt3a[i][3] = grid->g.rad.a[i][2]*facra+
2133                          grid->g.rad.b[i][2]*facg+
2134                          znva*vxc.df0010*rhowa[i][3]-
2135                          znva*znva*zetaa*vxc.df0010*grid->g.rad.a[i][2]+
2136 	                 rhowb[i][3]*vxc.df00001;
2137             vt3b[i][3] = grid->g.rad.b[i][2]*facrb+
2138                          grid->g.rad.a[i][2]*facg+
2139                          znvb*vxc.df0001*rhowb[i][3]-
2140                          znvb*znvb*zetab*vxc.df0001*grid->g.rad.b[i][2]+
2141 	                 rhowa[i][3]*vxc.df00001;
2142     }
2143 
2144 
2145     for(isym=0; isym<grid->nsym; isym++) {
2146       integer (*RESTRICT iblocks)[2] = BASBLOCK(grid,isym);
2147       integer ibl_cnt = grid->bas_bl_cnt[isym];
2148 
2149       for(ibl=0; ibl<ibl_cnt; ibl++) {
2150 	for(i=iblocks[ibl][0]-1; i<iblocks[ibl][1]; i++) {
2151 	  real *RESTRICT g0i = &aos[i*bllen];
2152 	  real *RESTRICT gxi = &aox[i*bllen];
2153 	  real *RESTRICT gyi = &aoy[i*bllen];
2154 	  real *RESTRICT gzi = &aoz[i*bllen];
2155 	  integer ioff = i*inforb_.nbast + ivec*inforb_.n2basx;
2156 	  integer jsym = inforb_.muld2h[data->ksymop-1][isym]-1;
2157 	  integer (*RESTRICT jblocks)[2] = BASBLOCK(grid,jsym);
2158 	  integer jbl_cnt = grid->bas_bl_cnt[jsym];
2159 	  for(k=blstart; k<blend; k++) {
2160                       tmpa[k] = vt3a[k][0]*g0i[k]+
2161                                 vt3a[k][1]*gxi[k]+
2162                                 vt3a[k][2]*gyi[k]+
2163 			        vt3a[k][3]*gzi[k];
2164                       tmpb[k] = vt3b[k][0]*g0i[k]+
2165                                 vt3b[k][1]*gxi[k]+
2166                                 vt3b[k][2]*gyi[k]+
2167 		        	vt3b[k][3]*gzi[k];
2168 	  }
2169 	  for(jbl=0; jbl<jbl_cnt; jbl++) {
2170 	    integer jtop = jblocks[jbl][1];
2171 	    for(j=jblocks[jbl][0]-1; j<jtop; j++) {
2172 	      real *RESTRICT g0j = &aos[j*bllen];
2173 	      real sa = 0;
2174 	      real sb = 0;
2175 	      for(k=blstart; k<blend; k++) {
2176 		sa += g0j[k]*tmpa[k];
2177 		sb += g0j[k]*tmpb[k];
2178 	      }
2179 	      exc_a[j+ioff] += sa;
2180 	      exc_b[j+ioff] += sb;
2181 	    }
2182 	  }
2183 	}
2184       }
2185     }
2186   }
2187 }
2188 
2189 
2190 void
FSYM2(dft_lin_respab_b)2191 FSYM2(dft_lin_respab_b)(integer *nosim, real* fmatc, real* fmato, real *cmo,
2192 			real *zymat, integer *trplet, integer *ksymop, real* work,
2193 			integer* lwork, integer* iprint)
2194 {
2195   static const integer MAX_VEC = 5;
2196   static const real DP5R  = 0.5;
2197   static const real MDP5R = -0.5;
2198 
2199   struct tms starttm, endtm; clock_t utm;
2200   real electrons = 0;
2201   LinResp_ab_BlData lr_data;
2202   real *runit;
2203   integer max_vecs;
2204   integer i, j, jvec, ivec;
2205   real averag;
2206   real * fmata, * fmatb;
2207 
2208   /* parallel code insert */
2209 #if defined(VAR_MPI)
2210   dft_wake_slaves((DFTPropEvalMaster)FSYM2(dft_lin_respab_b));
2211   dft_lin_resp_sync_slaves(cmo,nosim,&zymat,trplet,ksymop,
2212 			   &work, *lwork);
2213 #endif
2214   /* start timer and allocate memory */
2215   times(&starttm);
2216   max_vecs = *nosim>MAX_VEC ? MAX_VEC : *nosim;
2217   lr_data.dmata   = dal_malloc(2*inforb_.n2basx*sizeof(real));
2218   lr_data.dmatb   = lr_data.dmata+inforb_.n2basx;
2219   lr_data.res_a   = dal_malloc(inforb_.n2basx*sizeof(real)*max_vecs);
2220   lr_data.res_b   = dal_malloc(inforb_.n2basx*sizeof(real)*max_vecs);
2221   lr_data.kappa_a = dal_malloc(inforb_.n2basx*sizeof(real)*max_vecs);
2222   lr_data.kappa_b = dal_malloc(inforb_.n2basx*sizeof(real)*max_vecs);
2223   lr_data.tmpa    = dal_malloc(inforb_.nbast*DFT_BLLEN*sizeof(real));
2224   lr_data.tmpb    = dal_malloc(inforb_.nbast*DFT_BLLEN*sizeof(real));
2225   if (selected_func->is_gga()) {
2226     lr_data.rhowa    = dal_malloc(DFT_BLLEN*4*sizeof(real));
2227     lr_data.rhowb    = dal_malloc(DFT_BLLEN*4*sizeof(real));
2228     lr_data.vta      = dal_malloc(4*DFT_BLLEN*sizeof(real));
2229     lr_data.vtb      = dal_malloc(4*DFT_BLLEN*sizeof(real));
2230   } else {
2231     lr_data.rhowa    = dal_malloc(DFT_BLLEN*sizeof(real));
2232     lr_data.rhowb    = dal_malloc(DFT_BLLEN*sizeof(real));
2233     lr_data.vta      = dal_malloc(DFT_BLLEN*sizeof(real));
2234     lr_data.vtb      = dal_malloc(DFT_BLLEN*sizeof(real));
2235   }
2236   lr_data.dtgaoa  = dal_malloc(inforb_.nbast*sizeof(real));
2237   lr_data.dtgaob  = dal_malloc(inforb_.nbast*sizeof(real));
2238   lr_data.trplet= *trplet;
2239   lr_data.ksymop= *ksymop;
2240   /* get density in AO basis*/
2241   FSYM2(dft_get_ao_dens_matab)(cmo,lr_data.dmatb,lr_data.dmata,work,lwork);
2242   FSYM(daxpy)(&inforb_.n2basx,&DP5R,lr_data.dmatb,&ONEI,lr_data.dmata,&ONEI);
2243   FSYM(dscal)(&inforb_.n2basx,&DP5R,lr_data.dmatb,&ONEI);
2244   /* get active density in MO basis */
2245   runit=dal_malloc(inforb_.n2ashx*sizeof(real));
2246   FSYM(dunit)(runit,&inforb_.nasht);
2247   /* MO alpha, beta Vxc contributions */
2248   integer sz1 = max_vecs*inforb_.n2orbx;
2249   integer sz1_int= sz1;
2250   fmata=dal_malloc(sz1_int*sizeof(real));
2251   fmatb=dal_malloc(sz1_int*sizeof(real));
2252   for(ivec=0; ivec<*nosim; ivec+=max_vecs) {
2253     integer sz;
2254     lr_data.vecs_in_batch = ivec + max_vecs > *nosim ? *nosim - ivec : max_vecs;
2255     sz = lr_data.vecs_in_batch * inforb_.n2basx;
2256     FSYM(dzero)(lr_data.kappa_a, &sz);
2257     FSYM(dzero)(lr_data.kappa_b, &sz);
2258     for(jvec=0; jvec<lr_data.vecs_in_batch; jvec++)
2259       FSYM(deq27)(cmo,zymat+(ivec+jvec)*inforb_.n2orbx,runit,
2260 		  lr_data.kappa_b+jvec*inforb_.n2basx,
2261 		  lr_data.kappa_a+jvec*inforb_.n2basx,
2262 		  work,lwork);
2263     dscal_(&sz,&DP5R,lr_data.kappa_b,&ONEI);
2264     FSYM(daxpy)(&sz,&ONER,lr_data.kappa_b,&ONEI,lr_data.kappa_a,&ONEI);
2265     FSYM(dzero)(lr_data.res_a, &sz);
2266     FSYM(dzero)(lr_data.res_b, &sz);
2267     electrons = dft_integrate_ao_bl(2, lr_data.dmata, work, lwork, iprint, 0,
2268 				    (DftBlockCallback)(selected_func->is_gga() ?
2269 				    lin_resp_cb_b_gga_ab : lin_resp_cb_b_lda_ab),
2270 				    &lr_data);
2271 #ifdef VAR_MPI
2272     dft_lin_respab_b_collect(lr_data.vecs_in_batch, lr_data.res_a, lr_data.res_b, work, *lwork);
2273     if(fmatc == NULL)  continue;
2274 #endif
2275     FSYM(dzero)(fmata, &sz1);
2276     FSYM(dzero)(fmatb, &sz1);
2277     for(jvec=0; jvec<lr_data.vecs_in_batch; jvec++){
2278       for(i=0; i<inforb_.nbast; i++) {
2279 	integer ioff = i*inforb_.nbast + jvec*inforb_.n2basx;
2280 	for(j=0; j<i; j++) {
2281 	  integer joff = j*inforb_.nbast + jvec*inforb_.n2basx;
2282 	  averag = 0.5*(lr_data.res_a[i+joff] +
2283 			lr_data.res_a[j+ioff]);
2284 	  lr_data.res_a[i+joff] = lr_data.res_a[j+ioff] = averag;
2285 	  averag = 0.5*(lr_data.res_b[i+joff] +
2286 			lr_data.res_b[j+ioff]);
2287 	  lr_data.res_b[i+joff] = lr_data.res_b[j+ioff] = averag;
2288 	}
2289       }
2290       /* transform to MO Fock matrix contribution  */
2291       FSYM(lrao2mo)(cmo, &lr_data.ksymop, lr_data.res_a+jvec*inforb_.n2basx,
2292 		    fmata+jvec*inforb_.n2orbx, work, lwork);
2293       FSYM(lrao2mo)(cmo, &lr_data.ksymop, lr_data.res_b+jvec*inforb_.n2basx,
2294 		    fmatb+jvec*inforb_.n2orbx, work, lwork);
2295     }
2296     integer sz2 = inforb_.n2orbx*lr_data.vecs_in_batch;
2297     FSYM(daxpy)(&sz2,&ONER,fmata,&ONEI,fmatc+ivec*inforb_.n2orbx,&ONEI);
2298     if (inforb_.nasht > 0) {
2299       if (*trplet) {
2300 	FSYM(daxpy)(&sz2,&MDP5R,fmatb,&ONEI,fmato+ivec*inforb_.n2orbx,&ONEI);
2301       } else {
2302 	FSYM(daxpy)(&sz2,&DP5R,fmatb,&ONEI,fmato+ivec*inforb_.n2orbx,&ONEI);
2303       }
2304       FSYM(daxpy)(&sz2,&MDP5R,fmata,&ONEI,fmato+ivec*inforb_.n2orbx,&ONEI);
2305     }
2306   }
2307   free(runit);
2308   free(fmata);
2309   free(fmatb);
2310   free(lr_data.dmata);
2311   free(lr_data.res_a);
2312   free(lr_data.res_b);
2313   free(lr_data.kappa_a);
2314   free(lr_data.kappa_b);
2315   free(lr_data.rhowa);
2316   free(lr_data.rhowb);
2317   free(lr_data.vta);
2318   free(lr_data.vtb);
2319   free(lr_data.tmpa);
2320   free(lr_data.tmpb);
2321   free(lr_data.dtgaoa);
2322   free(lr_data.dtgaob);
2323 
2324   if (*iprint>0) {
2325     times(&endtm);
2326     utm = endtm.tms_utime-starttm.tms_utime;
2327     fort_print("      Electrons: %f(%9.3g): LR-AB-DFT*%d evaluation time: %9.1f s",
2328 	       electrons, (double)(electrons-(integer)(electrons+0.5)), *nosim,
2329 	       utm/(double)sysconf(_SC_CLK_TCK));
2330   }
2331 }
2332 
2333