1!=================================================================================
2!
3! Routines:
4!
5! (1) w_sum()         Originally By JRD       Last Modified 4/1/2012 (JRD)
6!
7!  Multiply Valence-Valence matrix elements by W to create temporary arrays
8!  for the head, wing and body.
9!
10!  This routine scales as N^3, but is nested within the the igp loop
11!  in mtxel_kernel. Thus, it represents an N^4 step. Doing the multiplication
12!  here is cheaper than doing it in the N^5 g_sum subroutine.
13!
14!=================================================================================
15
16#include "f_defs.h"
17
18module w_sum_m
19
20  use global_m
21  implicit none
22
23  public :: w_sum
24
25  private
26
27contains
28
29  subroutine w_sum(xct,wptcol,ofs1,ofs1p,n1,n1p,temph,tempw,tempb,m11p,indinvigp,ng_eps)
30    type (xctinfo), intent(in) :: xct
31    SCALAR, intent(in) :: wptcol(:)
32    !> offset (i.e., add this number to map a local index to the global band index)
33    integer, intent(in) :: ofs1, ofs1p
34    !> number of bands for each wfn
35    integer, intent(in) :: n1, n1p
36    SCALAR, intent(inout) :: tempw(:,:,:,:), tempb(:,:,:,:), temph(:,:,:), m11p(:,:,:,:)
37    integer, intent(in) :: indinvigp
38    integer, intent(in) :: ng_eps
39
40    SCALAR, allocatable :: m11p_conj(:,:,:)
41    integer :: isv, ig, i1, i1p, gi1, gi1p
42
43    PUSH_SUB(w_sum)
44
45    ! JRD: We allocate a new temporary array in order to get better cache performance
46    SAFE_ALLOCATE(m11p_conj,(n1,n1p,xct%nspin))
47    m11p_conj(:,:,:) = MYCONJG(m11p(indinvigp,:,:,:))
48
49    do isv=1,xct%nspin
50      if (indinvigp .eq. 1) then
51        temph(ofs1+1:ofs1+n1, ofs1p+1:ofs1p+n1p, isv) = wptcol(1)*m11p_conj(1:n1, 1:n1p, isv)
52
53        !$OMP PARALLEL PRIVATE(i1p, gi1p, i1, gi1, ig) SHARED(wptcol, tempb, m11p_conj)
54        do i1p = 1, n1p
55          gi1p = ofs1p+i1p
56          do i1 = 1, n1
57            gi1 = ofs1+i1
58            !$OMP DO
59            do ig=2,ng_eps
60              tempw(ig, gi1, gi1p, isv) = wptcol(ig) * m11p_conj(i1, i1p, isv)
61            enddo
62            !$OMP END DO
63          enddo
64        enddo
65        !$OMP END PARALLEL
66
67      else
68
69        tempw(1, ofs1+1:ofs1+n1, ofs1p+1:ofs1p+n1p, isv) = tempw(1, ofs1+1:ofs1+n1, ofs1p+1:ofs1p+n1p, isv) + &
70          wptcol(1)*m11p_conj(1:n1, 1:n1p, isv)
71
72        !$OMP PARALLEL PRIVATE(i1p, gi1p, i1, gi1, ig) SHARED(wptcol, tempb, m11p_conj)
73        do i1p = 1, n1p
74          gi1p = ofs1p+i1p
75          do i1 = 1, n1
76            gi1 = ofs1+i1
77            !$OMP DO
78            do ig=2,ng_eps
79              tempb(ig, gi1, gi1p, isv) = tempb(ig, gi1, gi1p, isv) + &
80                wptcol(ig) * m11p_conj(i1, i1p, isv)
81            enddo
82            !$OMP END DO NOWAIT
83          enddo
84        enddo
85        !$OMP END PARALLEL
86
87      endif
88
89    enddo
90
91    SAFE_DEALLOCATE(m11p_conj)
92
93    POP_SUB(w_sum)
94
95  end subroutine w_sum
96
97end module w_sum_m
98