1
2*     ****************************
3*     *                    	 *
4*     *   pspw_potential_sHFX0   *
5*     *                          *
6*     ****************************
7      subroutine pspw_potential_sHFX0(ispin0,psi_r,Hpsi_r)
8      implicit none
9      integer ispin0
10      real  psi_r(*)
11      real Hpsi_r(*)
12
13#include "bafdecls.fh"
14#include "pspw_hfx.fh"
15#include "errquit.fh"
16
17      integer istart,iend,jstart,jend,imodn,imodtask
18      integer ms,l,q,n,indx1,indx2,Levels,neq(2)
19      integer requests(5),reqcnt
20
21      integer  Butter_Levels,Dneall_na_ptr
22      external Butter_Levels,Dneall_na_ptr
23
24*     ***** now do exchange as normal ****
25!$OMP MASTER
26      ehfx = 0.0d0
27      phfx = 0.0d0
28!$OMP END MASTER
29      if (((norbs(1)+norbs(2)).ne.0).and.relaxed) then
30
31         if (replicated) then
32
33*           **** butterfly algorithm ****
34            if (butterfly) then
35               call Dneall_neq(neq)
36               Levels = Butter_Levels(npj)
37               do ms=1,ispin
38                  call Parallel_shared_vector_szero(.false.,nrsize,
39     >                          real_mb(Hpsi_r_replicated(1)))
40                  call Parallel_shared_vector_scopy(.true.,
41     >                       neq(ms)*n2ft3d,
42     >                       psi_r(1+(ms-1)*neq(1)*n2ft3d),
43     >                       real_mb(psi_r_replicated(1)))
44
45                  do l=0,Levels-1
46                     call D1dBs_Brdcst_step(l,
47     >                       int_mb(Dneall_na_ptr(ms)),-1,
48     >                       n2ft3d,
49     >                       real_mb(psi_r_replicated(1)),
50     >                       requests,reqcnt)
51
52                     call Butter_indexes(l,taskid_j,npj,
53     >                       int_mb(Dneall_na_ptr(ms)),
54     >                       istart,iend,jstart,jend,
55     >                       imodn,imodtask)
56                     call pspw_potential_sHFX_sub2(solver_type,
57     >                                  istart,iend,
58     >                                  jstart,jend,
59     >                                  imodn,imodtask,
60     >                                  n2ft3d,
61     >                                  real_mb(psi_r_replicated(1)),
62     >                                  real_mb(Hpsi_r_replicated(1)),
63     >                                  ehfx)
64
65                     call D1dB_WaitAll(requests,reqcnt)
66                  end do
67
68                  call Butter_indexes_L1(taskid_j,npj,
69     >                       int_mb(Dneall_na_ptr(ms)),
70     >                       istart,iend,jstart,jend,
71     >                       imodn,imodtask)
72                  if (jend.ge.jstart)
73     >               call pspw_potential_sHFX_sub2(solver_type,
74     >                               istart,iend,
75     >                               jstart,jend,
76     >                               imodn,imodtask,
77     >                               n2ft3d,
78     >                               real_mb(psi_r_replicated(1)),
79     >                               real_mb(Hpsi_r_replicated(1)),
80     >                               ehfx)
81                  call Butter_indexes_L2(taskid_j,npj,
82     >                       int_mb(Dneall_na_ptr(ms)),
83     >                       istart,iend,jstart,jend,
84     >                       imodn,imodtask)
85                  call pspw_potential_sHFX_sub2(solver_type,
86     >                               istart,iend,
87     >                               jstart,jend,
88     >                               imodn,imodtask,
89     >                               n2ft3d,
90     >                               real_mb(psi_r_replicated(1)),
91     >                               real_mb(Hpsi_r_replicated(1)),
92     >                               ehfx)
93
94                  do l=Levels-1,0,-1
95                     call D1dBs_Reduce_step(l,
96     >                       int_mb(Dneall_na_ptr(ms)),-1,
97     >                       n2ft3d,
98     >                       real_mb(Hpsi_r_replicated(1)),
99     >                       real_mb(psi_r_replicated(1)))
100                  end do
101                  call SAXPY_OMP(neq(ms)*n2ft3d,hfx_parameter,
102     >                       real_mb(Hpsi_r_replicated(1)),1,
103     >                       Hpsi_r(1+(ms-1)*neq(1)*n2ft3d),1)
104               end do
105
106
107*              *** apply hfx_parameter ****
108!$OMP MASTER
109               ehfx = ehfx*hfx_parameter
110
111               if (ispin.eq.1) ehfx = ehfx + ehfx
112!$OMP END MASTER
113               call Parallel_SumAll(ehfx)
114!$OMP MASTER
115               phfx = 2.0d0*ehfx
116!$OMP END MASTER
117
118*           **** reduceall algorithm ****
119            else
120            call Parallel_shared_vector_szero(.false.,nrsize,
121     >                                 real_mb(psi_r_replicated(1)))
122            call Parallel_shared_vector_szero(.true.,nrsize,
123     >                                 real_mb(Hpsi_r_replicated(1)),1)
124            do q=1,neqall
125               call Dneall_qton(q,n)
126               indx1 = (q-1)*n2ft3d + 1
127               indx2 = psi_r_replicated(1)+(n-1)*n2ft3d
128               call Parallel_shared_vector_scopy(.true.,n2ft3d,
129     >                                      psi_r(indx1),real_mb(indx2))
130            end do
131            call D1dBs_Vector_SumAll(nrsize,
132     >                       real_mb(psi_r_replicated(1)))
133            call pspw_potential_sHFX_sub(ispin0,
134     >                                  real_mb(psi_r_replicated(1)),
135     >                                  real_mb(Hpsi_r_replicated(1)))
136            call D1dBs_Vector_SumAll(nrsize,
137     >                       real_mb(Hpsi_r_replicated(1)))
138            do q=1,neqall
139               call Dneall_qton(q,n)
140               indx1 = Hpsi_r_replicated(1)+(n-1)*n2ft3d
141               indx2 = (q-1)*n2ft3d + 1
142               call SAXPY_OMP(n2ft3d,1.0d0,real_mb(indx1),1,
143     >                                     Hpsi_r(indx2),1)
144            end do
145            end if
146
147         else
148            call pspw_potential_sHFX_sub(ispin0,psi_r,Hpsi_r)
149         end if
150
151      end if
152
153      return
154      end
155
156
157
158*     *************************
159*     *                       *
160*     *    pspw_energy_sHFX0  *
161*     *                       *
162*     *************************
163      subroutine pspw_energy_sHFX0(ispin0,psi_r,ehfx_out,phfx_out)
164      implicit none
165      integer ispin0
166      real  psi_r(*)
167      real*8 ehfx_out
168      real*8 phfx_out
169
170#include "bafdecls.fh"
171#include "pspw_hfx.fh"
172#include "errquit.fh"
173
174      integer q,n,indx1,indx2
175
176      if (((norbs(1)+norbs(2)).ne.0).and.(.not.relaxed)) then
177
178c     **** calculate HFX energy  ****
179         if (replicated) then
180
181            call Parallel_shared_vector_szero(.true.,nrsize,
182     >                                    real_mb(psi_r_replicated(1)))
183            do q=1,neqall
184               call Dneall_qton(q,n)
185               indx1 = (q-1)*n2ft3d + 1
186               indx2 = psi_r_replicated(1)+(n-1)*n2ft3d
187               call Parallel_shared_vector_scopy(.true.,n2ft3d,
188     >                                     psi_r(indx1),real_mb(indx2))
189            end do
190            call D1dBs_Vector_SumAll(nrsize,
191     >                       real_mb(psi_r_replicated(1)))
192            call pspw_energy_sHFX_sub(ispin0,
193     >                               real_mb(psi_r_replicated(1)),
194     >                               ehfx_out,phfx_out)
195
196         else
197
198            call pspw_energy_sHFX_sub(ispin0,psi_r,ehfx_out,phfx_out)
199
200         end if
201
202c     **** nothing to do ****
203      else
204         ehfx_out = ehfx
205         phfx_out = phfx
206      end if
207
208      return
209      end
210
211
212
213
214*     ********************************
215*     *                    	     *
216*     *    pspw_potential_sHFX_orb   *
217*     *                              *
218*     ********************************
219      subroutine pspw_potential_sHFX_orb(ms,psi_r,
220     >                                  orb_r,Horb_r)
221      implicit none
222      integer    ms
223      real     psi_r(*)
224      real     orb_r(*)
225      real     Horb_r(*)
226
227#include "bafdecls.fh"
228#include "pspw_hfx.fh"
229#include "errquit.fh"
230
231*     **** local variables ****
232      logical value
233      integer i,j,n1,n2,n3
234      integer dn(2),vij(2),tmp1(2),index2
235      real  scal1,scal2,dv,seh
236      real*8  eh,ph
237
238*     **** external functions ****
239      real*8   lattice_omega
240      external lattice_omega
241      real     coulomb_screened_s_e
242      external coulomb_screened_s_e
243
244
245      call nwpw_timing_start(33)
246      if ((norbs(ms).ne.0).and.relaxed) then
247        call D3dB_nx(1,n1)
248        call D3dB_ny(1,n2)
249        call D3dB_nz(1,n3)
250        !call D3dB_n2ft3d(1,n2ft3d)
251        value = BA_push_get(mt_real,(n2ft3d),'dn_hfx',dn(2),dn(1))
252        value = value.and.
253     >          BA_push_get(mt_real,(n2ft3d),'vij_hfx',vij(2),vij(1))
254        value = value.and.
255     >          BA_push_get(mt_real,(n2ft3d),'tmp1_hfx',tmp1(2),tmp1(1))
256        if (.not. value) call errquit('out of stack memory',0, MA_ERR)
257        call Parallel_shared_vector_szero(.false.,n2ft3d,
258     >                                    real_mb(dn(1)))
259        call Parallel_shared_vector_szero(.false.,n2ft3d,
260     >                                   real_mb(vij(1)))
261        call Parallel_shared_vector_szero(.true.,n2ft3d,
262     >                                    real_mb(tmp1(1)))
263
264        scal1 = real(1.0d0/dble(n1*n2*n3))
265        scal2 = real(1.0d0/lattice_omega())
266        dv = scal1/scal2
267
268        do j=1,norbs(ms)
269           index2 = (int_mb(orbital_list(1,ms)+j-1)-1)*n2ft3d + 1
270
271*          **** generate dnij for Vij  ****
272           call D3dBs_rr_Mul(1,psi_r(index2),orb_r,real_mb(dn(1)))
273           call D3dBs_r_SMul1(1,scal2,real_mb(dn(1)))
274           call D3dBs_r_Zero_Ends(1,real_mb(dn(1)))
275
276*          ***** screened coulomb solver ****
277           if (solver_type.eq.1) then
278             call D3dBs_r_SMul1(1,scal1,real_mb(dn(1)))
279             call D3dBs_rc_pfft3f(1,0,real_mb(dn(1)))
280             call Packs_c_pack(0,real_mb(dn(1)))
281
282*            **** get Ecoul energy ****
283             eh = dble(coulomb_screened_s_e(real_mb(dn(1))))
284
285*            **** generate Vcoul ****
286             call coulomb_screened_s_v(real_mb(dn(1)),real_mb(vij(1)))
287             call Packs_c_unpack(0,real_mb(vij(1)))
288             call D3dBs_cr_pfft3b(1,0,real_mb(vij(1)))
289
290*          ***** free-space coulomb solver ****
291           else
292              call coulomb2_s_v(real_mb(dn(1)),real_mb(vij(1)))
293              call D3dBs_rr_dot(1,real_mb(dn(1)),real_mb(vij(1)),seh)
294              eh = dble(0.5*seh*dv)
295           end if
296
297*          **** apply hfx_parameter, double eh for restricted, and calculcate ph ****
298           eh = eh*hfx_parameter
299           call D3dBs_r_SMul1(1,real(hfx_parameter),real_mb(vij(1)))
300           if (ispin.eq.1) eh = eh + eh
301           ph = 2.0d0*eh
302
303*          **** generate (Vij)*psi_r ***
304           call D3dBs_rr_Mul(1,real_mb(vij(1)),
305     >                        psi_r(index2),
306     >                        real_mb(tmp1(1)))
307           call D3dBs_r_Zero_Ends(1,real_mb(tmp1(1)))
308
309*          **** add -(Vij)*psi_r to Hpsi_r ***
310           call D3dBs_rr_Sub(1,Horb_r,
311     >                        real_mb(tmp1(1)),
312     >                        Horb_r)
313        end do
314
315        value = value.and.BA_pop_stack(tmp1(2))
316        value = value.and.BA_pop_stack(vij(2))
317        value = value.and.BA_pop_stack(dn(2))
318        if (.not. value)
319     >    call errquit('pspw_potential_sHFX_orb:popping stack memory',0,
320     &       MA_ERR)
321      end if
322      call nwpw_timing_end(33)
323      return
324      end
325
326
327c***************** sub/replicated routines *****************************
328
329*     ********************************
330*     *                    	     *
331*     *    pspw_potential_sHFX_sub   *
332*     *                              *
333*     ********************************
334      subroutine pspw_potential_sHFX_sub(ispin0,psi_r,Hpsi_r)
335      implicit none
336      integer ispin0
337      real    psi_r(*)
338      real    Hpsi_r(*)
339
340#include "bafdecls.fh"
341#include "pspw_hfx.fh"
342#include "errquit.fh"
343
344*     **** local variables ****
345      logical value,done
346      integer i,j,n1,n2,n3,ms
347      integer dn(2),vij(2),tmp1(2),tmp2(2),index1,index2
348      integer i1,j1,k1,NN
349      integer i2,j2,k2
350      integer i3,j3,k3
351      real    scal1,scal2,dv,seh
352      real*8  eh,ph,ss,teh
353      integer center(3)
354      real*8  rcenter(3)
355      integer taskid,icount
356      real*8 cpu0,cpu1
357      integer ktaskjid,kcompute(2)
358
359*     **** external functions ****
360      real*8   lattice_omega
361      external lattice_omega
362
363      logical  D3dBs_rc_pfft3_queue_filled,D3dBs_cr_pfft3_queue_filled
364      external D3dBs_rc_pfft3_queue_filled,D3dBs_cr_pfft3_queue_filled
365      logical  pspw_hfx_localize_closeenough
366      external pspw_hfx_localize_closeenough
367
368      real*8   pspw_hfx_localize_switchr
369      external pspw_hfx_localize_switchr
370
371      real     icoulomb_screened_s_e
372      external icoulomb_screened_s_e
373      real     icoulomb_screened_small_s_e
374      external icoulomb_screened_small_s_e
375
376      call Parallel2d_taskid_i(taskid)
377      icount = 0
378      call current_second(cpu0)
379
380!$OMP MASTER
381      ehfx = 0.0d0
382      phfx = 0.0d0
383!$OMP END MASTER
384      if (((norbs(1)+norbs(2)).ne.0).and.relaxed) then
385        !call D3dB_nx(1,n1)
386        !call D3dB_ny(1,n2)
387        !call D3dB_nz(1,n3)
388        !call D3dB_n2ft3d(1,n2ft3d)
389        value = BA_push_get(mt_real,(n2ft3d),'dn_hfx',dn(2),dn(1))
390        value = value.and.
391     >          BA_push_get(mt_real,(n2ft3d),'vij_hfx',vij(2),vij(1))
392        value = value.and.
393     >          BA_push_get(mt_real,(n2ft3d),'tmp1_hfx',tmp1(2),tmp1(1))
394        value = value.and.
395     >          BA_push_get(mt_real,(n2ft3d),'tmp2_hfx',tmp2(2),tmp2(1))
396        NN = norbs(1)*(norbs(1)+1)/2
397        value = value.and.
398     >          BA_push_get(mt_int,NN,'kcmpute',kcompute(2),kcompute(1))
399        if (.not.value)
400     >     call errquit('pspw_potential_HFX_sub:out of stack memory',
401     >                  0,MA_ERR)
402        call Parallel_shared_vector_szero(.false.,
403     >                                    n2ft3d,real_mb(dn(1)))
404        call Parallel_shared_vector_szero(.false.,
405     >                                    n2ft3d,real_mb(vij(1)))
406        call Parallel_shared_vector_szero(.false.,
407     >                                    n2ft3d,real_mb(tmp1(1)))
408        call Parallel_shared_vector_szero(.true.,
409     >                                    n2ft3d,real_mb(tmp2(1)))
410
411        call D3dB_nx(1,n1)
412        call D3dB_ny(1,n2)
413        call D3dB_nz(1,n3)
414        scal1 = real(1.0d0/dble(n1*n2*n3))
415        scal2 = real(1.0d0/lattice_omega())
416        dv = scal1/scal2
417
418        if (localize_on.and.has_smallgrid) then
419           call D3dB_nx(3,n1)
420           call D3dB_ny(3,n2)
421           call D3dB_nz(3,n3)
422           scal1 = real(1.0d0/dble(n1*n2*n3))
423        end if
424
425*       **** compute kcompute ****
426        ktaskjid = 0
427
428*       ***** screened coulomb solver ****
429        if (solver_type.eq.1) then
430        do ms=1,ispin0
431           if (norbs(ms).eq.0) go to 898
432           call Parallel_shared_vector_zero(.true.,norbs(ms),
433     >                                dbl_mb(ehfx_orb(1,ms)))
434           NN = norbs(ms)*(norbs(ms)+1)/2
435
436*          **** compute kcompute ****
437           i1 = 1
438           j1 = 1
439           do k1=1,NN
440              if (pspw_hfx_localize_closeenough(
441     >                   int_mb(orbital_list(1,ms)+i1-1),
442     >                   int_mb(orbital_list(1,ms)+j1-1))) then
443                 int_mb(kcompute(1)+k1-1) = ktaskjid
444                 ktaskjid = mod(ktaskjid+1,npj)
445              else
446                 int_mb(kcompute(1)+k1-1) = npj+1
447              end if
448              j1 = j1 + 1
449              if (j1.gt.i1) then
450                 j1 = 1
451                 i1 = i1 + 1
452              end if
453           end do
454
455           i1 = 1
456           j1 = 1
457           k1 = 1
458           i2 = 1
459           j2 = 1
460           k2 = 1
461           i3 = 1
462           j3 = 1
463           k3 = 1
464           done = .false.
465           do while (.not.done)
466
467              if ((k1.le.NN).and.
468     >            (.not.D3dBs_rc_pfft3_queue_filled())) then
469
470                 !if (mod(k1,npj).eq.taskid_j) then
471                 if (int_mb(kcompute(1)+k1-1).eq.taskid_j) then
472                 if (pspw_hfx_localize_closeenough(
473     >                   int_mb(orbital_list(1,ms)+i1-1),
474     >                   int_mb(orbital_list(1,ms)+j1-1))) then
475
476                    index1 =(int_mb(orbital_list(1,ms)+i1-1)-1)*n2ft3d+1
477                    index2 =(int_mb(orbital_list(1,ms)+j1-1)-1)*n2ft3d+1
478
479*                   **** generate dnij for Vij  ****
480                    call D3dBs_rr_Mul(1,psi_r(index2),
481     >                                  psi_r(index1),real_mb(dn(1)))
482
483                    call D3dBs_r_SMul1(1,scal2*scal1,real_mb(dn(1)))
484                    call D3dBs_r_Zero_Ends(1,real_mb(dn(1)))
485
486                    if (localize_on.and.has_smallgrid) then
487                       call pspw_hfx_localize_center_ovlp(
488     >                          int_mb(orbital_list(1,ms)+i1-1),
489     >                          int_mb(orbital_list(1,ms)+j1-1),center)
490                       call D3dBs_rc_pfft3f_queuein_center(2,center,
491     >                                             real_mb(dn(1)))
492                    else
493                       call D3dBs_rc_pfft3f_queuein(0,real_mb(dn(1)))
494                    end if
495c
496
497
498                 end if
499                 end if
500
501                 k1 = k1 + 1
502                 j1 = j1 + 1
503                 if (j1.gt.i1) then
504                    j1 = 1
505                    i1 = i1 + 1
506                 end if
507              end if
508
509              if ((     ((D3dBs_rc_pfft3_queue_filled()).or.(k1.gt.NN))
510     >            .and.(k2.le.NN)).and.
511     >            (.not.D3dBs_cr_pfft3_queue_filled())) then
512
513                 !if (mod(k2,npj).eq.taskid_j) then
514                 if (int_mb(kcompute(1)+k2-1).eq.taskid_j) then
515                 if (pspw_hfx_localize_closeenough(
516     >                   int_mb(orbital_list(1,ms)+i2-1),
517     >                   int_mb(orbital_list(1,ms)+j2-1))) then
518
519                    ss = pspw_hfx_localize_switchr(
520     >                   int_mb(orbital_list(1,ms)+i2-1),
521     >                   int_mb(orbital_list(1,ms)+j2-1))
522
523                    if (localize_on.and.has_smallgrid) then
524                       call D3dBs_rc_pfft3f_queueout_center(2,
525     >                                              real_mb(dn(1)))
526                       eh = dble(icoulomb_screened_small_s_e(
527     >                                                real_mb(dn(1))))
528                       call coulomb_screened_small_s_v(real_mb(dn(1)),
529     >                                                 real_mb(vij(1)))
530                    else
531                       call D3dBs_rc_pfft3f_queueout(0,real_mb(dn(1)))
532                       eh = dble(icoulomb_screened_s_e(real_mb(dn(1))))
533                       call coulomb_screened_s_v(real_mb(dn(1)),
534     >                                           real_mb(vij(1)))
535                    end if
536
537*                   **** apply hfx_parameter, double eh for restricted, and calculcate ph ****
538                    eh = eh*hfx_parameter*ss
539                    if (ispin0.eq.1) eh = eh + eh
540                    ph = 2.0d0*eh
541!$OMP MASTER
542                    ehfx = ehfx - eh
543                    phfx = phfx - ph
544                    dbl_mb(ehfx_orb(1,ms)+i2-1)
545     >               = dbl_mb(ehfx_orb(1,ms)+i2-1) - eh
546!$OMP END MASTER
547                    if (i2.ne.j2) then
548!$OMP MASTER
549                       ehfx = ehfx - eh
550                       phfx = phfx - ph
551                       dbl_mb(ehfx_orb(1,ms)+i2-1)
552     >                  = dbl_mb(ehfx_orb(1,ms)+i2-1) - eh
553!$OMP END MASTER
554                    end if
555
556                    if (localize_on.and.has_smallgrid) then
557                       call pspw_hfx_localize_center_ovlp(
558     >                          int_mb(orbital_list(1,ms)+i2-1),
559     >                          int_mb(orbital_list(1,ms)+j2-1),center)
560                       call D3dBs_cr_pfft3b_queuein_center(2,center,
561     >                                             real_mb(vij(1)))
562                    else
563                       call D3dBs_cr_pfft3b_queuein(0,real_mb(vij(1)))
564                    end if
565
566                 end if
567                 end if
568
569                 k2 = k2 + 1
570                 j2 = j2 + 1
571                 if (j2.gt.i2) then
572                    j2 = 1
573                    i2 = i2 + 1
574                 end if
575              end if
576
577              if ((D3dBs_cr_pfft3_queue_filled()).or.(k2.gt.NN)) then
578
579                 !if (mod(k3,npj).eq.taskid_j) then
580                 if (int_mb(kcompute(1)+k3-1).eq.taskid_j) then
581                 if (pspw_hfx_localize_closeenough(
582     >                   int_mb(orbital_list(1,ms)+i3-1),
583     >                   int_mb(orbital_list(1,ms)+j3-1))) then
584
585                    index1 =(int_mb(orbital_list(1,ms)+i3-1)-1)*n2ft3d+1
586                    index2 =(int_mb(orbital_list(1,ms)+j3-1)-1)*n2ft3d+1
587
588                    ss = pspw_hfx_localize_switchr(
589     >                   int_mb(orbital_list(1,ms)+i3-1),
590     >                   int_mb(orbital_list(1,ms)+j3-1))
591
592                    if (localize_on.and.has_smallgrid) then
593                       call D3dBs_cr_pfft3b_queueout_center(2,
594     >                                              real_mb(vij(1)))
595                    else
596                       call D3dBs_cr_pfft3b_queueout(0,real_mb(vij(1)))
597                    end if
598
599
600*                   **** apply hfx_parameter ****
601                    call D3dBs_r_SMul1(1,real(hfx_parameter*ss),
602     >                                real_mb(vij(1)))
603
604*                   **** generate (Vij)*psi_r ***
605                    call D3dBs_rr_Mul(1,real_mb(vij(1)),
606     >                                 psi_r(index2),
607     >                                 real_mb(tmp1(1)))
608                    call D3dBs_r_Zero_Ends(1,real_mb(tmp1(1)))
609
610*                   **** add -(Vij)*psi_r to Hpsi_r ***
611                    call D3dBs_rr_Sub2(1,real_mb(tmp1(1)),
612     >                                    Hpsi_r(index1))
613
614                    !**** include off diagonal terms ****
615                    if (i3.ne.j3) then
616*                      **** generate (Vij)*psi_r ***
617                       call D3dBs_rr_Mul(1,real_mb(vij(1)),
618     >                                 psi_r(index1),
619     >                                 real_mb(tmp2(1)))
620                       call D3dBs_r_Zero_Ends(1,real_mb(tmp2(1)))
621
622*                      **** add -(Vij)*psi_r to Hpsi_r ***
623                       call D3dBs_rr_Sub2(1,real_mb(tmp2(1)),
624     >                                       Hpsi_r(index2))
625                    end if
626                 end if
627                 end if
628
629                 k3 = k3 + 1
630                 j3 = j3 + 1
631                 if (j3.gt.i3) then
632                    j3 = 1
633                    i3 = i3 + 1
634                 end if
635              end if
636              done = (k1.gt.NN).and.(k2.gt.NN).and.(k3.gt.NN)
637           end do !**** while ****
638           call Parallel_Vector_SumAll(norbs(ms),
639     >                                 dbl_mb(ehfx_orb(1,ms)))
640
641  898      continue
642       end do !**** ms *****
643
644*       ***** free-space coulomb solver ****
645        else
646        k1 = 1
647        do ms=1,ispin0
648        do i=1,norbs(ms)
649!$OMP MASTER
650         dbl_mb(ehfx_orb(1,ms)+i-1) = 0.0d0
651!$OMP END MASTER
652         do j=1,i
653           if (mod(k1,npj).eq.taskid_j) then
654           if (pspw_hfx_localize_closeenough(
655     >                   int_mb(orbital_list(1,ms)+i-1),
656     >                   int_mb(orbital_list(1,ms)+j-1))) then
657
658              index1 = (int_mb(orbital_list(1,ms)+i-1)-1)*n2ft3d + 1
659              index2 = (int_mb(orbital_list(1,ms)+j-1)-1)*n2ft3d + 1
660
661              ss = pspw_hfx_localize_switchr(
662     >             int_mb(orbital_list(1,ms)+i-1),
663     >             int_mb(orbital_list(1,ms)+j-1))
664
665*             **** generate dnij for Vij  ****
666              call D3dBs_rr_Mul(1,psi_r(index2),psi_r(index1),
667     >                           real_mb(dn(1)))
668              call D3dBs_r_SMul1(1,scal2,real_mb(dn(1)))
669              call D3dBs_r_Zero_Ends(1,real_mb(dn(1)))
670
671              call coulomb2_s_v(real_mb(dn(1)),real_mb(vij(1)))
672              call D3dBs_rr_idot(1,real_mb(dn(1)),real_mb(vij(1)),seh)
673              eh = dble(0.5*seh*dv)
674
675*             **** apply hfx_parameter, double eh for restricted, and calculcate ph ****
676              eh = eh*hfx_parameter*ss
677              call D3dBs_r_SMul1(1,real(hfx_parameter*ss),
678     >                              real_mb(vij(1)))
679              if (ispin0.eq.1) eh = eh + eh
680              ph = 2.0d0*eh
681
682
683!$OMP MASTER
684              ehfx = ehfx - eh
685              phfx = phfx - ph
686              dbl_mb(ehfx_orb(1,ms)+i-1) =dbl_mb(ehfx_orb(1,ms)+i-1)-eh
687!$OMP END MASTER
688
689*             **** generate (Vij)*psi_r ***
690              call D3dBs_rr_Mul(1,real_mb(vij(1)),
691     >                             psi_r(index2),
692     >                             real_mb(tmp1(1)))
693              call D3dBs_r_Zero_Ends(1,real_mb(tmp1(1)))
694
695
696*             **** add -(Vij)*psi_r to Hpsi_r ***
697              call D3dBs_rr_Sub2(1,real_mb(tmp1(1)),Hpsi_r(index1))
698
699              !**** include off diagonal terms ****
700              if (i.ne.j) then
701!$OMP MASTER
702                 ehfx = ehfx - eh
703                 phfx = phfx - ph
704                 dbl_mb(ehfx_orb(1,ms)+i-1) = dbl_mb(ehfx_orb(1,ms)+i-1)
705     >                                      - eh
706!$OMP END MASTER
707*                **** generate (Vij)*psi_r ***
708                 call D3dBs_rr_Mul(1,real_mb(vij(1)),
709     >                           psi_r(index1),
710     >                           real_mb(tmp2(1)))
711                 call D3dBs_r_Zero_Ends(1,real_mb(tmp2(1)))
712
713*                **** add -(Vij)*psi_r to Hpsi_r ***
714                 call D3dBs_rr_Sub2(1,real_mb(tmp2(1)),
715     >                               Hpsi_r(index2))
716              end if
717           end if
718           end if
719           k1 = k1 + 1
720         end do
721        end do
722        call Parallel_Vector_SumAll(norbs(ms),
723     >                        dbl_mb(ehfx_orb(1,ms)))
724        end do
725
726        end if
727
728        value =           BA_pop_stack(kcompute(2))
729        value = value.and.BA_pop_stack(tmp2(2))
730        value = value.and.BA_pop_stack(tmp1(2))
731        value = value.and.BA_pop_stack(vij(2))
732        value = value.and.BA_pop_stack(dn(2))
733        if (.not. value)
734     >    call errquit('pspw_potential_HFX:popping stack memory',0,
735     &       MA_ERR)
736
737         call Parallel_SumAll(ehfx)
738         call Parallel_SumAll(phfx)
739
740      end if
741
742      return
743      end
744
745
746*     *****************************
747*     *                           *
748*     *    pspw_energy_sHFX_sub   *
749*     *                           *
750*     *****************************
751      subroutine pspw_energy_sHFX_sub(ispin0,psi_r,ehfx_out,phfx_out)
752      implicit none
753      integer ispin0
754      real  psi_r(*)
755      real*8 ehfx_out
756      real*8 phfx_out
757
758#include "bafdecls.fh"
759#include "pspw_hfx.fh"
760#include "errquit.fh"
761
762*     **** local variables ****
763      logical value
764      integer i,j,n1,n2,n3,ms,k1
765      integer dn(2),tmp1(2),index1,index2
766      real    scal1,scal2,dv,seh
767      real*8  eh,ph,ss
768
769*     **** external functions ****
770      real*8   lattice_omega
771      external lattice_omega
772      logical  pspw_hfx_localize_closeenough
773      external pspw_hfx_localize_closeenough
774      real*8   pspw_hfx_localize_switchr
775      external pspw_hfx_localize_switchr
776      real     coulomb_screened_s_e
777      external coulomb_screened_s_e
778
779
780      if (((norbs(1)+norbs(2)).ne.0).and.(.not.relaxed)) then
781!$OMP MASTER
782        ehfx = 0.0d0
783        phfx = 0.0d0
784!$OMP END MASTER
785
786        call D3dB_nx(1,n1)
787        call D3dB_ny(1,n2)
788        call D3dB_nz(1,n3)
789        !call D3dB_n2ft3d(1,n2ft3d)
790        value = BA_push_get(mt_real,(2*n2ft3d),'dn_hfx',dn(2),dn(1))
791        value = value.and.
792     >          BA_push_get(mt_real,(n2ft3d),'tmp1_hfx',tmp1(2),tmp1(1))
793        if (.not. value) call errquit('out of stack memory',0, MA_ERR)
794        call Parallel_shared_vector_szero(.false.,
795     >                                     2*n2ft3d,real_mb(dn(1)))
796        call Parallel_shared_vector_szero(.true.,
797     >                                    n2ft3d,real_mb(tmp1(1)))
798
799        scal1 = real(1.0d0/dble(n1*n2*n3))
800        scal2 = real(1.0d0/lattice_omega())
801        dv = scal1/scal2
802
803        k1 = 1
804        do ms=1,ispin
805        do i=1,norbs(ms)
806!$OMP MASTER
807         dbl_mb(ehfx_orb(1,ms)+i-1) = 0.0d0
808!$OMP END MASTER
809         do j=1,i
810
811            if (mod(k1,npj).eq.taskid_j) then
812            if (pspw_hfx_localize_closeenough(
813     >                   int_mb(orbital_list(1,ms)+i-1),
814     >                   int_mb(orbital_list(1,ms)+j-1))) then
815
816              index1 = (int_mb(orbital_list(1,ms)+i-1)-1)*n2ft3d + 1
817              index2 = (int_mb(orbital_list(1,ms)+j-1)-1)*n2ft3d + 1
818
819              ss = pspw_hfx_localize_switchr(
820     >             int_mb(orbital_list(1,ms)+i-1),
821     >             int_mb(orbital_list(1,ms)+j-1))
822
823
824*             **** generate dnij ****
825              call D3dBs_rr_Mul(1,psi_r(index1),psi_r(index2),
826     >                         real_mb(dn(1)))
827              call D3dBs_r_SMul1(1,scal2,real_mb(dn(1)))
828              call D3dBs_r_Zero_Ends(1,real_mb(dn(1)))
829
830*             ***** screened coulomb solver ****
831              if (solver_type.eq.1) then
832
833*               **** generate dng ****
834                call D3dBs_r_SMul1(1,scal1,real_mb(dn(1)))
835                call D3dBs_rc_pfft3f(1,0,real_mb(dn(1)))
836                call Packs_c_pack(0,real_mb(dn(1)))
837
838*               **** get Ecoul energy ****
839                eh = dble(coulomb_screened_s_e(real_mb(dn(1))))
840
841*             ***** free-space coulomb solver ****
842              else
843                 call coulomb2_s_v(real_mb(dn(1)),real_mb(tmp1(1)))
844                 call D3dBs_rr_dot(1,real_mb(dn(1)),
845     >                               real_mb(tmp1(1)),seh)
846                 eh = dble(0.5*seh*dv)
847              end if
848
849*             **** apply hfx_parameter, double eh for restricted, and calculcate ph ****
850              eh = eh*hfx_parameter*ss
851              if (ispin0.eq.1) eh = eh + eh
852              ph = 2.0d0*eh
853
854!$OMP MASTER
855              ehfx = ehfx - eh
856              phfx = phfx - ph
857              dbl_mb(ehfx_orb(1,ms)+i-1) = dbl_mb(ehfx_orb(1,ms)+i-1)-eh
858!$OMP END MASTER
859
860              !**** include off diagonal terms ****
861              if (i.ne.j) then
862!$OMP MASTER
863                 ehfx = ehfx - eh
864                 phfx = phfx - ph
865                 dbl_mb(ehfx_orb(1,ms)+i-1) = dbl_mb(ehfx_orb(1,ms)+i-1)
866     >                                      - eh
867!$OMP END MASTER
868              end if
869
870            end if
871            end if
872            k1 = k1 + 1
873
874         end do
875        end do
876        call D1dB_Vector_SumAll(norbs(ms),dbl_mb(ehfx_orb(1,ms)))
877        end do
878
879        value =           BA_pop_stack(tmp1(2))
880        value = value.and.BA_pop_stack(dn(2))
881        if (.not. value)
882     >     call errquit('pspw_energy_HFX_sub:popping stack memory',0,
883     &       MA_ERR)
884
885        call D1dB_SumAll(ehfx)
886        call D1dB_SumAll(phfx)
887      end if
888      ehfx_out = ehfx
889      phfx_out = phfx
890
891      return
892      end
893
894
895
896
897*     ************************************
898*     *                    	         *
899*     *     pspw_potential_HFX_orb_sub   *
900*     *                                  *
901*     ************************************
902*
903*    Note that orb_r and Horb_r are assumed to be replicated rather than psi_r
904*    orb_r is not replicated in this routine
905*    Horb_r is not reduced in this routine
906*
907      subroutine pspw_potential_sHFX_orb_sub(ms,psi_r,
908     >                                      orb_r,Horb_r)
909      implicit none
910      integer    ms
911      real  psi_r(*)
912      real  orb_r(*)
913      real  Horb_r(*)
914
915#include "bafdecls.fh"
916#include "pspw_hfx.fh"
917#include "errquit.fh"
918
919*     **** local variables ****
920      logical value
921      integer j,n1,n2,n3,q2,p2
922      integer dn(2),vij(2),tmp1(2),tmp2(2),index2
923      real    scal1,scal2,dv,seh
924      real*8  eh,ph
925
926*     **** external functions ****
927      real*8   lattice_omega
928      external lattice_omega
929      real     coulomb_screened_s_e
930      external coulomb_screened_s_e
931
932      if (((norbs(1)+norbs(2)).ne.0).and.relaxed) then
933        call D3dB_nx(1,n1)
934        call D3dB_ny(1,n2)
935        call D3dB_nz(1,n3)
936        !call D3dB_n2ft3d(1,n2ft3d)
937        value = BA_push_get(mt_real,(n2ft3d),'dn_hfx',dn(2),dn(1))
938        value = value.and.
939     >          BA_push_get(mt_real,(n2ft3d),'vij_hfx',vij(2),vij(1))
940        value = value.and.
941     >          BA_push_get(mt_real,(n2ft3d),'tmp1_hfx',tmp1(2),tmp1(1))
942        value = value.and.
943     >          BA_push_get(mt_real,(n2ft3d),'tmp2_hfx',tmp2(2),tmp2(1))
944        if (.not. value) call errquit('out of stack memory',0, MA_ERR)
945        call Parallel_shared_vector_szero(.false.,n2ft3d,
946     >                                    real_mb(dn(1)))
947        call Parallel_shared_vector_szero(.false.,n2ft3d,
948     >                                    real_mb(vij(1)))
949        call Parallel_shared_vector_szero(.false.,n2ft3d,
950     >                                    real_mb(tmp1(1)))
951        call Parallel_shared_vector_szero(.true.,n2ft3d,
952     >                                    real_mb(tmp2(1)))
953
954        scal1 = real(1.0d0/dble(n1*n2*n3))
955        scal2 = real(1.0d0/lattice_omega())
956        dv = scal1/scal2
957
958        do j=1,norbs(ms)
959           call Dneall_ntoqp(int_mb(orbital_list(1,ms)+j-1),q2,p2)
960           index2 = (q2-1)*n2ft3d + 1
961
962           if (p2.eq.taskid_j) then
963*             **** generate dnij for Vij  ****
964              call D3dBs_rr_Mul(1,psi_r(index2),orb_r,real_mb(dn(1)))
965              call D3dBs_r_SMul1(1,scal2,real_mb(dn(1)))
966              call D3dBs_r_Zero_Ends(1,real_mb(dn(1)))
967
968*             ***** screened coulomb solver ****
969              if (solver_type.eq.1) then
970                call D3dBs_r_SMul1(1,scal1,real_mb(dn(1)))
971                call D3dBs_rc_pfft3f(1,0,real_mb(dn(1)))
972                call Packs_c_pack(0,real_mb(dn(1)))
973
974*               **** get Ecoul energy ****
975                eh = dble(coulomb_screened_s_e(real_mb(dn(1))))
976
977*               **** generate Vcoul ****
978                call coulomb_screened_s_v(real_mb(dn(1)),
979     >                                    real_mb(vij(1)))
980                call Packs_c_unpack(0,real_mb(vij(1)))
981                call D3dBs_cr_pfft3b(1,0,real_mb(vij(1)))
982
983*             ***** free-space coulomb solver ****
984              else
985                 call coulomb2_s_v(real_mb(dn(1)),real_mb(vij(1)))
986                 call D3dBs_rr_dot(1,real_mb(dn(1)),
987     >                               real_mb(vij(1)),seh)
988                 eh = dble(0.5*seh*dv)
989              end if
990
991*             **** apply hfx_parameter, double eh for restricted, and calculcate ph ****
992              eh = eh*hfx_parameter
993              call D3dBs_r_SMul1(1,hfx_parameter,real_mb(vij(1)))
994              if (ispin.eq.1) eh = eh + eh
995              ph = 2.0d0*eh
996
997
998*             **** generate (Vij)*psi_r ***
999              call D3dBs_rr_Mul(1,real_mb(vij(1)),
1000     >                           psi_r(index2),
1001     >                           real_mb(tmp1(1)))
1002              call D3dBs_r_Zero_Ends(1,real_mb(tmp1(1)))
1003
1004*             **** add -(Vij)*psi_r to Hpsi_r ***
1005              call D3dBs_rr_Sub2(1,real_mb(tmp1(1)),Horb_r)
1006           end if
1007        end do
1008        value =           BA_pop_stack(tmp2(2))
1009        value = value.and.BA_pop_stack(tmp1(2))
1010        value = value.and.BA_pop_stack(vij(2))
1011        value = value.and.BA_pop_stack(dn(2))
1012        if (.not. value)
1013     >  call errquit('pspw_potential_sHFX_orb_sub:popping stack memory',
1014     >               0,MA_ERR)
1015
1016c        **** eh and ph not used yet ****
1017c        call D1dB_SumAll(eh)
1018c        call D1dB_SumAll(ph)
1019      end if
1020      return
1021      end
1022
1023
1024
1025*     ***********************************
1026*     *                    	        *
1027*     *    pspw_potential_HFX_sub2      *
1028*     *                                 *
1029*     ***********************************
1030*
1031*   This routine is a kernel for computing exact exchange.
1032*
1033*    for i=istart:iend
1034*    for j=jstart:jend
1035*       dnij(*) = psi_r(*,j) .* psi_r(*,i)
1036*       Vij(*)  = Coulomb operator(dnij(*))
1037*       Hpsi_r(*,i) = Vij(*) .* psi_r(*,j)
1038*       Hpsi_r(*,j) = Vij(*) .* psi_r(*,i)
1039*       ehfx += 0.5*<psi_r(*,i)|Hpsi(*,i)>
1040*             + 0.5*<psi_r(*,j)|Hpsi(*,j)>
1041*
1042*   Entry - solver_type: if solver_type==1 then periodic solver, else aperiodic solver
1043*           istart,iend: indexes
1044*           jstart,jend: indexes
1045*           imodn,imodtask: used to define which (i,j) combinations are computed.
1046*           n2ft3d: size of realspace grid
1047*           psi_r: wavenfucntions in realspace.
1048*           ehfx: running sum of exchange energy, not initialized in this routine
1049*
1050*   Exit - Hpsi_r: wavefunction gradients in realspace.
1051*          ehfx: running sum of exchange energy.
1052
1053      subroutine pspw_potential_sHFX_sub2(solver_type,
1054     >                                   istart,iend,
1055     >                                   jstart,jend,
1056     >                                   imodn,imodtask,
1057     >                                   n2ft3d,psi_r,Hpsi_r,
1058     >                                   ehfx)
1059      implicit none
1060      integer solver_type
1061      integer istart,iend,jstart,jend
1062      integer imodn,imodtask
1063      integer n2ft3d
1064      real  psi_r(n2ft3d,*)
1065      real  Hpsi_r(n2ft3d,*)
1066      real*8  ehfx
1067
1068#include "bafdecls.fh"
1069#include "errquit.fh"
1070
1071      integer taskid_j
1072
1073*     **** local variables ****
1074      logical value,done,special
1075      integer n1,n2,n3
1076      integer dn(2),vij(2),tmp1(2)
1077      integer i1,j1,k1,NN
1078      integer i2,j2,k2
1079      integer i3,j3,k3
1080      real  scal1,scal2,dv,seh
1081      real*8  eh,ph
1082
1083*     **** external functions ****
1084      real*8   lattice_omega
1085      external lattice_omega
1086      logical  D3dBs_rc_pfft3_queue_filled,D3dBs_cr_pfft3_queue_filled
1087      external D3dBs_rc_pfft3_queue_filled,D3dBs_cr_pfft3_queue_filled
1088      real     icoulomb_screened_s_e
1089      external icoulomb_screened_s_e
1090
1091      call Parallel2d_taskid_j(taskid_j)
1092
1093      special = ((istart.eq.jstart).and.(iend.eq.jend))
1094
1095      call D3dB_nx(1,n1)
1096      call D3dB_ny(1,n2)
1097      call D3dB_nz(1,n3)
1098      value = BA_push_get(mt_real,(n2ft3d),'dn_hfx',dn(2),dn(1))
1099      value = value.and.
1100     >        BA_push_get(mt_real,(n2ft3d),'vij_hfx',vij(2),vij(1))
1101      value = value.and.
1102     >        BA_push_get(mt_real,(n2ft3d),'tmp1_hfx',tmp1(2),tmp1(1))
1103      if (.not. value)
1104     >   call errquit('pspw_potential_sHFX_sub:out of stack',0,MA_ERR)
1105      call Parallel_shared_vector_szero(.false.,
1106     >                                  n2ft3d,real_mb(dn(1)))
1107      call Parallel_shared_vector_szero(.false.,
1108     >                                  n2ft3d,real_mb(vij(1)))
1109      call Parallel_shared_vector_szero(.true.,
1110     >                                  n2ft3d,real_mb(tmp1(1)))
1111
1112      scal1 = real(1.0d0/dble(n1*n2*n3))
1113      scal2 = real(1.0d0/lattice_omega())
1114      dv = scal1/scal2
1115
1116*     *** special if i and j span the same indexes ***
1117      if (special) then
1118         NN = (iend-istart+1)*(jend-jstart+2)/2
1119      else
1120         NN = (iend-istart+1)*(jend-jstart+1)
1121      end if
1122
1123*     ***** screened coulomb solver ****
1124      if (solver_type.eq.1) then
1125        i1 = istart
1126        j1 = jstart
1127        k1 = 1
1128
1129        i2 = istart
1130        j2 = jstart
1131        k2 = 1
1132
1133        i3 = istart
1134        j3 = jstart
1135        k3 = 1
1136        done = .false.
1137        do while (.not.done)
1138
1139*          *** pipeline step 1 ***
1140           if (k1.le.NN) then
1141
1142              if (mod(k1,imodn).eq.imodtask) then
1143
1144*                **** generate dnij for Vij  ****
1145                 call D3dBs_rr_Mul(1,psi_r(1,j1),
1146     >                              psi_r(1,i1),real_mb(dn(1)))
1147                 call D3dBs_r_SMul1(1,scal2*scal1,real_mb(dn(1)))
1148                 call D3dBs_r_Zero_Ends(1,real_mb(dn(1)))
1149                 call D3dBs_rc_pfft3f_queuein(0,real_mb(dn(1)))
1150
1151              end if
1152
1153              k1 = k1 + 1
1154              j1 = j1 + 1
1155              if (special) then
1156                 if (j1.gt.i1) then
1157                    j1 = jstart
1158                    i1 = i1 + 1
1159                 end if
1160              else
1161                 if (j1.gt.jend) then
1162                    j1 = jstart
1163                    i1 = i1 + 1
1164                 end if
1165              end if
1166           end if
1167
1168*          *** pipeline step 2 ***
1169           if (     ((D3dBs_rc_pfft3_queue_filled()).or.(k1.gt.NN))
1170     >         .and.(k2.le.NN)) then
1171
1172              if (mod(k2,imodn).eq.imodtask) then
1173                 call D3dBs_rc_pfft3f_queueout(0,real_mb(dn(1)))
1174
1175*                **** generate Vcoul ****
1176                 eh = dble(icoulomb_screened_s_e(real_mb(dn(1))))
1177                 call coulomb_screened_s_v(real_mb(dn(1)),
1178     >                                     real_mb(vij(1)))
1179
1180
1181*                **** calculcate ph ****
1182!OMP MASTER
1183                 ehfx = ehfx - eh
1184!OMP END MASTER
1185
1186*                **** include transpose ***
1187                 if ((i2.ne.j2).or.(.not.special)) then
1188!OMP MASTER
1189                    ehfx = ehfx - eh
1190!OMP END MASTER
1191                 end if
1192
1193                 call D3dBs_cr_pfft3b_queuein(0,real_mb(vij(1)))
1194              end if
1195
1196              k2 = k2 + 1
1197              j2 = j2 + 1
1198              if (special) then
1199                 if (j2.gt.i2) then
1200                    j2 = jstart
1201                    i2 = i2 + 1
1202                 end if
1203              else
1204                 if (j2.gt.jend) then
1205                    j2 = jstart
1206                    i2 = i2 + 1
1207                 end if
1208              end if
1209
1210           end if
1211
1212*          *** pipeline step 3 ***
1213           if ((D3dBs_cr_pfft3_queue_filled()).or.(k2.gt.NN)) then
1214
1215              if (mod(k3,imodn).eq.imodtask) then
1216                 call D3dBs_cr_pfft3b_queueout(0,real_mb(vij(1)))
1217
1218*                **** generate (Vij)*psi_r ***
1219                 call D3dBs_rr_Mul(1,real_mb(vij(1)),
1220     >                              psi_r(1,j3),
1221     >                              real_mb(tmp1(1)))
1222                 call D3dBs_r_Zero_Ends(1,real_mb(tmp1(1)))
1223
1224*                **** add -(Vij)*psi_r to Hpsi_r ***
1225                 call D3dBs_rr_Sub2(1,real_mb(tmp1(1)),Hpsi_r(1,i3))
1226
1227                 !**** include transpose ****
1228                 if ((i3.ne.j3).or.(.not.special)) then
1229
1230*                   **** generate (Vij)*psi_r ***
1231                    call D3dBs_rr_Mul(1,real_mb(vij(1)),
1232     >                              psi_r(1,i3),
1233     >                              real_mb(tmp1(1)))
1234                    call D3dBs_r_Zero_Ends(1,real_mb(tmp1(1)))
1235
1236*                   **** add -(Vij)*psi_r to Hpsi_r ***
1237                    call D3dBs_rr_Sub2(1,real_mb(tmp1(1)),
1238     >                                Hpsi_r(1,j3))
1239                 end if
1240              endif
1241
1242              k3 = k3 + 1
1243              j3 = j3 + 1
1244              if (special) then
1245                 if (j3.gt.i3) then
1246                    j3 = jstart
1247                    i3 = i3 + 1
1248                 end if
1249              else
1250                 if (j3.gt.jend) then
1251                    j3 = jstart
1252                    i3 = i3 + 1
1253                 end if
1254              end if
1255
1256           end if
1257           done = (k1.gt.NN).and.(k2.gt.NN).and.(k3.gt.NN)
1258        end do !**** while ****
1259
1260
1261*     ***** free-space coulomb solver -- not pipelined ****
1262      else
1263         k1 = 1
1264         i1 = istart
1265         j1 = jstart
1266         done = .false.
1267         do while (.not.done)
1268
1269            if (mod(k1,imodn).eq.imodtask) then
1270
1271*              **** generate dnij for Vij  ****
1272               call D3dBs_rr_Mul(1,psi_r(1,j1),psi_r(1,i1),
1273     >                          real_mb(dn(1)))
1274               call D3dBs_r_SMul1(1,scal2,real_mb(dn(1)))
1275               call D3dBs_r_Zero_Ends(1,real_mb(dn(1)))
1276
1277               call coulomb2_s_v(real_mb(dn(1)),real_mb(vij(1)))
1278               call D3dBs_rr_idot(1,real_mb(dn(1)),
1279     >                              real_mb(vij(1)),seh)
1280               eh = dble(0.5*seh*dv)
1281
1282*              **** calculcate ph ****
1283!OMP MASTER
1284               ehfx = ehfx - eh
1285!OMP END MASTER
1286
1287*              **** generate (Vij)*psi_r ***
1288               call D3dBs_rr_Mul(1,real_mb(vij(1)),
1289     >                            psi_r(1,j1),
1290     >                            real_mb(tmp1(1)))
1291               call D3dBs_r_Zero_Ends(1,real_mb(tmp1(1)))
1292
1293*              **** add -(Vij)*psi_r to Hpsi_r ***
1294               call D3dBs_rr_Sub2(1,real_mb(tmp1(1)),Hpsi_r(1,i1))
1295
1296               !**** include transpose terms ****
1297               if ((i1.ne.j1).or.(.not.special)) then
1298!OMP MASTER
1299                  ehfx = ehfx - eh
1300!OMP END MASTER
1301
1302*                 **** generate (Vij)*psi_r ***
1303                  call D3dBs_rr_Mul(1,real_mb(vij(1)),
1304     >                            psi_r(1,i1),
1305     >                            real_mb(tmp1(1)))
1306                  call D3dBs_r_Zero_Ends(1,real_mb(tmp1(1)))
1307
1308*                 **** add -(Vij)*psi_r to Hpsi_r ***
1309                  call D3dBs_rr_Sub2(1,real_mb(tmp1(1)),
1310     >                                Hpsi_r(1,j1))
1311               end if
1312
1313            end if
1314
1315            k1 = k1 + 1
1316            j1 = j1 + 1
1317            if (special) then
1318               if (j1.gt.i1) then
1319                  j1 = jstart
1320                  i1 = i1 + 1
1321               end if
1322            else
1323               if (j1.gt.jend) then
1324                  j1 = jstart
1325                  i1 = i1 + 1
1326               end if
1327            end if
1328            done = (k1.gt.NN)
1329         end do
1330
1331      end if
1332
1333      !**** deallocate memory ****
1334      value =           BA_pop_stack(tmp1(2))
1335      value = value.and.BA_pop_stack(vij(2))
1336      value = value.and.BA_pop_stack(dn(2))
1337      if (.not. value) call errquit(
1338     >   'pspw_potential_sHFX_sub2:popping stack memory',0,MA_ERR)
1339
1340      return
1341      end
1342
1343
1344
1345*     *******************************************
1346*     *                    	                *
1347*     *     pspw_potential_sHFX_orb_replicated   *
1348*     *                                         *
1349*     *******************************************
1350      subroutine pspw_potential_sHFX_orb_replicated(ms,psi_r,
1351     >                                  orb_r,Horb_r)
1352      implicit none
1353      integer    ms
1354      real*8     psi_r(*)
1355      real*8     orb_r(*)
1356      real*8     Horb_r(*)
1357
1358#include "bafdecls.fh"
1359#include "pspw_hfx.fh"
1360#include "errquit.fh"
1361
1362
1363      call nwpw_timing_start(33)
1364      if ((norbs(ms).ne.0).and.relaxed) then
1365         if (replicated) then
1366            call Parallel_shared_vector_scopy(.true.,n2ft3d,
1367     >                                  real_mb(Hpsi_r_replicated(1)))
1368            call pspw_potential_HFX_orb_sub(ms,psi_r,orb_r,
1369     >                                  real_mb(Hpsi_r_replicated(1)))
1370            call D1dBs_Vector_SumAll(n2ft3d,
1371     >                               real_mb(Hpsi_r_replicated(1)))
1372            call SAXPY_OMP(n2ft3d,1.0d0,real_mb(Hpsi_r_replicated(1)),1,
1373     >                 Horb_r,1)
1374
1375         else
1376            call pspw_potential_sHFX_orb_sub(ms,psi_r,orb_r,Horb_r)
1377         end if
1378
1379      end if
1380      call nwpw_timing_end(33)
1381      return
1382      end
1383
1384
1385
1386
1387
1388
1389*    ***** routines below used for pspw_et calculations ****
1390
1391
1392*     *************************
1393*     *                       *
1394*     *    pspw_energy_sHFX2  *
1395*     *                       *
1396*     *************************
1397      subroutine pspw_energy_sHFX2(ispin0,psi1_r,psi2_r,
1398     >                            ehfx_out,phfx_out)
1399      implicit none
1400      integer ispin0
1401      real   psi1_r(*),psi2_r(*)
1402      real*8 ehfx_out
1403      real*8 phfx_out
1404
1405#include "bafdecls.fh"
1406#include "pspw_hfx.fh"
1407#include "errquit.fh"
1408
1409      integer q,n,indx1,indx2
1410
1411      call nwpw_timing_start(33)
1412
1413c     **** calculate HFX energy  ****
1414      if ((norbs(1)+norbs(2)).ne.0) then
1415
1416         if (replicated) then
1417            call Parallel_shared_vector_szero(.false.,nrsize,
1418     >                                 real_mb(psi_r_replicated(1)))
1419            call Parallel_shared_vector_szero(.true.,nrsize,
1420     >                                 real_mb(Hpsi_r_replicated(1)))
1421            do q=1,neqall
1422               call Dneall_qton(q,n)
1423               indx1 = (q-1)*n2ft3d + 1
1424               indx2 = psi_r_replicated(1)+(n-1)*n2ft3d
1425               call Parallel_shared_vector_scopy(.false.,n2ft3d,
1426     >                                psi1_r(indx1),real_mb(indx2))
1427
1428               indx2 = Hpsi_r_replicated(1)+(n-1)*n2ft3d
1429               call Parallel_shared_vector_scopy(.true.,n2ft3d,
1430     >                            psi2_r(indx1),real_mb(indx2))
1431            end do
1432            call D1dBs_Vector_SumAll(nrsize,
1433     >                               real_mb(psi_r_replicated(1)))
1434            call D1dBs_Vector_SumAll(nrsize,
1435     >                              real_mb(Hpsi_r_replicated(1)))
1436            call pspw_energy_sHFX2_sub(ispin0,
1437     >                               real_mb(psi_r_replicated(1)),
1438     >                               real_mb(Hpsi_r_replicated(1)),
1439     >                               ehfx_out,phfx_out)
1440
1441         else
1442            call pspw_energy_sHFX2_sub(ispin0,psi1_r,psi2_r,
1443     >                                ehfx_out,phfx_out)
1444         end if
1445
1446c     **** nothing to do ****
1447      else
1448         ehfx_out = ehfx
1449         phfx_out = phfx
1450      end if
1451      call nwpw_timing_end(33)
1452
1453      return
1454      end
1455
1456
1457
1458*     *****************************
1459*     *                           *
1460*     *    pspw_energy_sHFX2_sub  *
1461*     *                           *
1462*     *****************************
1463      subroutine pspw_energy_sHFX2_sub(ispin0,psi1_r,psi2_r,
1464     >                               ehfx_out,phfx_out)
1465      implicit none
1466      integer ispin0
1467      real   psi1_r(*)
1468      real   psi2_r(*)
1469      real*8 ehfx_out
1470      real*8 phfx_out
1471
1472#include "bafdecls.fh"
1473#include "pspw_hfx.fh"
1474#include "errquit.fh"
1475
1476*     **** local variables ****
1477      logical value
1478      integer i,j,n1,n2,n3,ms,k1
1479      integer dn(2),tmp1(2),index1,index2
1480      real    scal1,scal2,dv,seh
1481      real*8  eh,ph
1482
1483*     **** external functions ****
1484      real*8   lattice_omega,coulomb_screened_s_e
1485      external lattice_omega,coulomb_screened_s_e
1486
1487      integer taskid
1488
1489      call Parallel_taskid(taskid)
1490
1491
1492      if ((norbs(1)+norbs(2)).ne.0) then
1493!OMP MASTER
1494        ehfx = 0.0d0
1495        phfx = 0.0d0
1496!OMP END MASTER
1497
1498        call D3dB_nx(1,n1)
1499        call D3dB_ny(1,n2)
1500        call D3dB_nz(1,n3)
1501        !call D3dB_n2ft3d(1,n2ft3d)
1502        value = BA_push_get(mt_real,(2*n2ft3d),'dn_hfx',dn(2),dn(1))
1503        value = value.and.
1504     >          BA_push_get(mt_real,(n2ft3d),'tmp1_hfx',tmp1(2),tmp1(1))
1505        if (.not. value) call errquit('out of stack memory',0, MA_ERR)
1506        call Parallel_shared_vector_szero(.false.,
1507     >                                    2*n2ft3d,real_mb(dn(1)),1)
1508        call Parallel_shared_vector_szero(.true.,
1509     >                                    n2ft3d,real_mb(tmp1(1)),1)
1510
1511        scal1 = real(1.0d0/dble(n1*n2*n3))
1512        scal2 = real(1.0d0/lattice_omega())
1513        dv = scal1/scal2
1514
1515        k1 = 1
1516        do ms=1,ispin
1517        do i=1,norbs(ms)
1518!OMP MASTER
1519         dbl_mb(ehfx_orb(1,ms)+i-1) = 0.0d0
1520!OMP END MASTER
1521         do j=1,i
1522
1523            if (mod(k1,npj).eq.taskid_j) then
1524              index1 = (int_mb(orbital_list(1,ms)+i-1)-1)*n2ft3d + 1
1525              index2 = (int_mb(orbital_list(1,ms)+j-1)-1)*n2ft3d + 1
1526
1527*             **** generate dnij ****
1528              call D3dBs_rr_Mul(1,psi1_r(index1),psi2_r(index2),
1529     >                         real_mb(dn(1)))
1530              call D3dBs_r_SMul1(1,scal2,real_mb(dn(1)))
1531              call D3dBs_r_Zero_Ends(1,real_mb(dn(1)))
1532
1533*             **** generate  dnji****
1534              call D3dBs_rr_Mul(1,psi1_r(index2),psi2_r(index1),
1535     >                         real_mb(dn(1)+n2ft3d))
1536              call D3dBs_r_SMul1(1,scal2,real_mb(dn(1)+n2ft3d))
1537              call D3dBs_r_Zero_Ends(1,real_mb(dn(1)+n2ft3d))
1538
1539*             ***** screened coulomb solver ****
1540              if (solver_type.eq.1) then
1541
1542*               **** generate dng ****
1543                call D3dBs_r_SMul1(1,scal1,real_mb(dn(1)))
1544                call D3dBs_rc_pfft3f(1,0,real_mb(dn(1)))
1545                call Packs_c_pack(0,real_mb(dn(1)))
1546
1547                call D3dB_r_SMul1(1,scal1,real_mb(dn(1)+n2ft3d))
1548                call D3dB_rc_pfft3f(1,0,real_mb(dn(1)+n2ft3d))
1549                call Packs_c_pack(0,real_mb(dn(1)+n2ft3d))
1550
1551*               **** get Ecoul energy ****
1552                call coulomb_screened_s_v(real_mb(dn(1)),
1553     >                                    real_mb(tmp1(1)))
1554                call Packs_cc_dot(0,real_mb(dn(1)+n2ft3d),
1555     >                              real_mb(tmp1(1)),seh)
1556                eh = dble(0.5*seh*lattice_omega())
1557
1558*             ***** free-space coulomb solver ****
1559              else
1560                 call coulomb2_s_v(real_mb(dn(1)),real_mb(tmp1(1)))
1561                 call D3dBs_rr_dot(1,real_mb(dn(1)+n2ft3d),
1562     >                            real_mb(tmp1(1)),seh)
1563                 eh = dble(0.5*seh*dv)
1564
1565                 !if (taskid.eq.0) write(*,*) " - ms,i,j,ehfx=",ms,i,j,eh
1566
1567              end if
1568
1569*             **** apply hfx_parameter, double eh for restricted, and calculcate ph ****
1570              eh = eh*hfx_parameter
1571              if (ispin0.eq.1) eh = eh + eh
1572              ph = 2.0d0*eh
1573
1574!OMP MASTER
1575              ehfx = ehfx - eh
1576              phfx = phfx - ph
1577              dbl_mb(ehfx_orb(1,ms)+i-1) = dbl_mb(ehfx_orb(1,ms)+i-1)-eh
1578!OMP END MASTER
1579
1580              !**** include off diagonal terms ****
1581              if (i.ne.j) then
1582!OMP MASTER
1583                 ehfx = ehfx - eh
1584                 phfx = phfx - ph
1585                 dbl_mb(ehfx_orb(1,ms)+i-1) = dbl_mb(ehfx_orb(1,ms)+i-1)
1586     >                                      - eh
1587!OMP END MASTER
1588              end if
1589
1590            end if
1591            k1 = k1 + 1
1592
1593         end do
1594        end do
1595        call D1dB_Vector_SumAll(norbs(ms),dbl_mb(ehfx_orb(1,ms)))
1596        end do
1597
1598        value =           BA_pop_stack(tmp1(2))
1599        value = value.and.BA_pop_stack(dn(2))
1600        if (.not. value)
1601     >     call errquit('pspw_energy_sHFX2_sub:popping stack memory',0,
1602     >       MA_ERR)
1603
1604        call D1dB_SumAll(ehfx)
1605        call D1dB_SumAll(phfx)
1606      end if
1607      ehfx_out = ehfx
1608      phfx_out = phfx
1609
1610      return
1611      end
1612
1613
1614
1615
1616