! --- Tangled code
module m_redist_spmatrix
#ifdef SIESTA__PEXSI
  implicit none

  type, public :: comm_t
     integer :: src, dst, i1, i2, nitems
  end type comm_t

  integer, parameter, private :: dp = selected_real_kind(10,100)

  type, public :: dp_pointer
     ! boxed array pointer type
     real(dp), pointer :: data(:) => null()
  end type dp_pointer

  type, public :: aux_matrix
     integer :: norbs = -1
     integer :: no_l  = -1
     integer :: nnzl  = -1
     integer, pointer :: numcols(:) => null()
     integer, pointer :: cols(:)    => null()
     ! array of 1D pointers
     type(dp_pointer), pointer :: vals(:) => null()
  end type aux_matrix

  public :: redistribute_spmatrix

CONTAINS

  subroutine redistribute_spmatrix(norbs,m1,dist1,m2,dist2,bridge_comm)
    use mpi
    use class_Distribution
    use alloc, only: re_alloc, de_alloc

    implicit none

    integer, intent(in) :: norbs          ! Overall number of rows
    type(aux_matrix)    :: m1             ! Source matrix
    type(aux_matrix)    :: m2             ! Destination matrix -- it is allocated
    type(Distribution)  :: dist1, dist2   ! Distributions
    integer, intent(in) :: bridge_comm    ! Umbrella communicator

    type(comm_t), dimension(:), allocatable, target :: comms
    type(comm_t), dimension(:), allocatable, target :: commsnnz
    type(comm_t), pointer :: c, cnnz

    integer :: myrank1, myrank2, myid, gg
    logical :: proc_in_set1, proc_in_set2
    integer :: ierr

    integer :: i, io, g1, g2, j, nvals
    integer :: comparison, n1, n2, c1, c2
    integer, parameter :: dp = selected_real_kind(10,100)
    real(dp), dimension(:), pointer :: data1 => null(), data2 => null()
    integer, allocatable :: ranks1(:), ranks2(:)

    ! The communicators are a sanity check on the ranks
    call mpi_comm_rank(bridge_comm,myid,ierr)
    c1 = ref_comm(dist1)
    c2 = ref_comm(dist2)
    call MPI_Comm_Compare(c1,c2,comparison,ierr)
    select case (comparison)
    case (MPI_IDENT)
       ! Communicators are identical
    case (MPI_CONGRUENT)
       ! Communicators have the same group and rank order,
       ! but they differ in context
    case (MPI_SIMILAR)
       ! Rank order is different
       call die("Different rank order in communicators")
    case (MPI_UNEQUAL)
       ! Big mess
       call die("Incompatible distribution reference communicators")
    end select

    ! Now check congruence with the provided bridge_comm
    call MPI_Comm_Compare(c1,bridge_comm,comparison,ierr)
    select case (comparison)
    case (MPI_IDENT)
       ! Communicators are identical
    case (MPI_CONGRUENT)
       ! Communicators have the same group and rank order,
       ! but they differ in context. We will use bridge_comm
    case (MPI_SIMILAR)
       ! Rank order is different
       call die("Different rank order in dist communicators and bridge comm")
    case (MPI_UNEQUAL)
       ! Big mess
       call die("Incompatible bridge and dist communicators")
    end select

    ! Now create groups g1 and g2.
    ! (DO NOT trust the internal handles)
    call MPI_Comm_Group(bridge_comm,gg,ierr)
    call get_ranks_in_ref_comm(dist1, ranks1)
    call get_ranks_in_ref_comm(dist2, ranks2)
    n1 = size(ranks1)
    n2 = size(ranks2)
    call MPI_Group_Incl(gg,n1,ranks1,g1,ierr)
    call MPI_Group_Incl(gg,n2,ranks2,g2,ierr)

    ! The rest is the same as before
    call mpi_group_rank(g1,myrank1,ierr)
    call mpi_group_rank(g2,myrank2,ierr)

    proc_in_set1 = (myrank1 /= MPI_UNDEFINED)
    proc_in_set2 = (myrank2 /= MPI_UNDEFINED)

    if (proc_in_set1 .or. proc_in_set2) then
       print "(a,3i6,2l2)", "world_rank, rank1, rank2, ing1?, ing2?", myid, &
            myrank1, myrank2, proc_in_set1, proc_in_set2
    endif

    ! Figure out the communication needs
    call analyze_comms()

    ! In preparation for the transfer, we allocate
    ! storage for the second group of processors.
    ! Note that m2%numcols (and, in general, any of the 2nd set
    ! of arrays) will not be allocated by those processors
    ! not in the second set.
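    ! Descriptive note on the overall sequence: numcols is transferred
    ! first with the orbital-based plan in "comms"; from the received
    ! numcols a second plan ("commsnnz"), indexed by non-zero
    ! positions, is then built and reused for the cols array and for
    ! each of the "vals" arrays.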
    if (proc_in_set2) then
       m2%norbs = norbs
       m2%no_l  = num_local_elements(dist2,norbs,myrank2)
       call re_alloc(m2%numcols,1,m2%no_l,"m2%numcols","redistribute_spmatrix")
    endif

    if (myid == 0) print *, "About to transfer numcols..."
    call do_transfers_int(comms,m1%numcols,m2%numcols, &
         g1,g2,bridge_comm)
    if (myid == 0) print *, "Transferred numcols."

    ! We need to tell the processes in set 2 how many
    ! "vals" to expect.
    if (proc_in_set1) then
       if (associated(m1%vals)) then
          nvals = size(m1%vals)
       else
          nvals = 0
       endif
    endif
    ! Now do a broadcast within bridge_comm, using as root one
    ! process in the first set. Let's say the one with rank 0
    ! in g1, the first in the set, which will have rank=ranks1(1)
    ! in bridge_comm
    call MPI_Bcast(nvals,1,MPI_Integer,ranks1(1),bridge_comm,ierr)

    ! Now we can figure out how many non-zeros there are
    if (proc_in_set2) then
       m2%nnzl = sum(m2%numcols(1:m2%no_l))
       call re_alloc(m2%cols,1,m2%nnzl,"m2%cols","redistribute_spmatrix")
       if (nvals > 0) then
          allocate(m2%vals(nvals))
          do j=1,nvals
             call re_alloc(m2%vals(j)%data,1,m2%nnzl, &
                  "m2%vals(j)%data","redistribute_spmatrix")
          enddo
       endif
    endif

    ! Generate a new comms-structure with new start/count indexes
    allocate(commsnnz(size(comms)))
    do i = 1, size(comms)
       c    => comms(i)
       cnnz => commsnnz(i)
       cnnz%src = c%src
       cnnz%dst = c%dst
       if (myrank1 == c%src) then
          ! Starting position at source: previous cols plus 1
          cnnz%i1 = sum(m1%numcols(1:(c%i1-1))) + 1
          ! Number of items transmitted: total number of cols
          cnnz%nitems = sum(m1%numcols(c%i1 : c%i1 + c%nitems - 1))
       endif
       if (myrank2 == c%dst) then
          ! Starting position at destination: previous cols plus 1
          cnnz%i2 = sum(m2%numcols(1 : (c%i2-1))) + 1
          ! Number of items transmitted: total number of cols
          cnnz%nitems = sum(m2%numcols(c%i2 : c%i2 + c%nitems - 1))
       endif
    end do

!!$    do i = 1, size(comms)
!!$       c => commsnnz(i)
!!$       if (myrank1 == c%src) then
!!$          print "(a,i5,a,2i5,2i7,i5)", &
!!$               "commnnz(src): ", i, " src, dst, i1, (), n:", &
!!$               c%src, c%dst, c%i1, -1, c%nitems
!!$       endif
!!$       if (myrank2 == c%dst) then
!!$          print "(a,i5,a,2i5,2i7,i5)", &
!!$               "commnnz(dst): ", i, " src, dst, (), i2, n:", &
!!$               c%src, c%dst, -1, c%i2, c%nitems
!!$       endif
!!$    enddo

    if (myid == 0) print *, "About to transfer cols..."
    ! Transfer the cols arrays
    call do_transfers_int(commsnnz,m1%cols,m2%cols, &
         g1, g2, bridge_comm)

    if (myid == 0) print *, "About to transfer values..."
    ! Transfer the values arrays
    do j=1, nvals
       if (proc_in_set1) data1 => m1%vals(j)%data
       if (proc_in_set2) data2 => m2%vals(j)%data
       call do_transfers_dp(commsnnz,data1,data2, &
            g1,g2,bridge_comm)
    enddo
    nullify(data1,data2)

    if (myid == 0) print *, "Done transfers."

    deallocate(commsnnz)
    deallocate(comms)
    deallocate(ranks1, ranks2)

    call MPI_group_free(gg,ierr)
    call MPI_group_free(g1,ierr)
    call MPI_group_free(g2,ierr)

  CONTAINS

    !-----------------------------------------------------
    subroutine analyze_comms()

      integer, allocatable, dimension(:) :: p1, p2, isrc, idst
      integer :: ncomms
      ! To turn on debug printing, set this to .true.
      logical, save :: comms_not_printed = .false.

      ! Find the communication needs for each orbital.
      ! This information is replicated in every processor.
      ! (Note that the indexing functions are able to find
      ! out the information for any processor. For the
      ! block-cyclic and "pexsi" distributions, this is quite
      ! easy. For others, the underlying indexing arrays might
      ! be large...)
      ! It might not be necessary to have this in memory. It
      ! can be done on the fly.
      allocate(p1(norbs),p2(norbs),isrc(norbs),idst(norbs))
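      ! For each global orbital io:
      !   p1(io), p2(io)     : rank owning io in dist1 / dist2
      !   isrc(io), idst(io) : local index of io on that owning rank
      ! Illustrative (hypothetical) example: if orbitals 1..4 all live
      ! on rank 0 of dist1 and on rank 2 of dist2, the aggregation
      ! below produces a single comm_t entry with src=0, dst=2,
      ! i1=isrc(1), i2=idst(1), nitems=4.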
      ! if (myid == 0) then
      !    write(6,"(5a10)") "Orb", "p1", "i1", "p2", "i2"
      ! endif
      do io = 1, norbs
         p1(io)   = node_handling_element(dist1,io)
         p2(io)   = node_handling_element(dist2,io)
         isrc(io) = index_global_to_local(dist1,io,p1(io))
         idst(io) = index_global_to_local(dist2,io,p2(io))
         ! if (myid == 0) then
         !    if ((norbs < 1000) .or. (mod(io,12) == 0)) then
         !       write(6,"(5i10)") io, p1(io), isrc(io), p2(io), idst(io)
         !    endif
         ! endif
      enddo

      ! Aggregate communications.
      ! First pass: find out how many there are, on the basis
      ! of groups of orbitals that share the same source and
      ! destination. Due to the form of the distributions, the
      ! local indexes are also correlative in that case, so we
      ! only need to check for p1 and p2. (Check whether this
      ! applies to every possible distribution...)
      ncomms = 1
      do io = 2, norbs
         if ((p1(io) /= p1(io-1)) .or. (p2(io) /= p2(io-1))) then
            ncomms = ncomms + 1
         endif
      enddo

      allocate(comms(ncomms))

      ! Second pass: fill in the data structures
      ncomms = 1
      c => comms(ncomms)
      io = 1
      c%src = p1(io)
      c%dst = p2(io)
      c%i1  = isrc(io)
      c%i2  = idst(io)
      c%nitems = 1
      do io = 2, norbs
         if ((p1(io) /= p1(io-1)) .or. (p2(io) /= p2(io-1))) then
            ! end of group -- new communication
            ncomms = ncomms + 1
            c => comms(ncomms)
            c%src = p1(io)
            c%dst = p2(io)
            c%i1  = isrc(io)
            c%i2  = idst(io)
            c%nitems = 1
         else
            ! we stay in the same communication
            c%nitems = c%nitems + 1
         endif
      enddo

      if (myid == 0 .and. comms_not_printed) then
         do i = 1, ncomms
            c => comms(i)
            write(6,"(a,i5,a,2i5,2i7,i5)") &
                 "comm: ", i, " src, dst, i1, i2, n:", &
                 c%src, c%dst, c%i1, c%i2, c%nitems
         enddo
         comms_not_printed = .false.
      endif

      deallocate(p1,p2,isrc,idst)

    end subroutine analyze_comms

  end subroutine redistribute_spmatrix

  !--------------------------------------------------
  subroutine do_transfers_int(comms,data1,data2,g1,g2,bridge_comm)
    use mpi

    type(comm_t), intent(in), target :: comms(:)
    integer, dimension(:), pointer   :: data1
    integer, dimension(:), pointer   :: data2
    integer, intent(in)              :: g1
    integer, intent(in)              :: g2
    integer, intent(in)              :: bridge_comm

    integer :: basegroup, nsize1, nsize2, ierr
    integer, allocatable :: comm_rank1(:), comm_rank2(:)
    integer :: ncomms
    integer :: i
    integer :: nrecvs_local, nsends_local
    integer, allocatable :: statuses(:,:), local_reqR(:), local_reqS(:)
    integer :: src_in_comm, dst_in_comm
    integer :: myrank1, myrank2, myrank
    type(comm_t), pointer :: c

    ! Find the rank correspondences, in case
    ! there is implicit renumbering at the time of group creation
    call MPI_Comm_group( bridge_comm, basegroup, ierr )
    call MPI_Comm_Rank( bridge_comm, myrank, ierr )
    call MPI_Group_Size( g1, nsize1, ierr )
    call MPI_Group_Size( g2, nsize2, ierr )
    allocate(comm_rank1(0:nsize1-1))
    call MPI_Group_translate_ranks( g1, nsize1, (/ (i,i=0,nsize1-1) /), &
         basegroup, comm_rank1, ierr )
    ! print "(i4,a,10i3)", myrank, ":Ranks of g1 in base group:", comm_rank1
    allocate(comm_rank2(0:nsize2-1))
    call MPI_Group_translate_ranks( g2, nsize2, (/ (i,i=0,nsize2-1) /), &
         basegroup, comm_rank2, ierr )
    ! print "(i4,a,10i3)", myrank,":Ranks of g2 in base group:", comm_rank2

    call mpi_group_rank(g1,myrank1,ierr)
    call mpi_group_rank(g2,myrank2,ierr)
    ! print "(i4,a,2i6)", myrank,": Ranks in g1 and g2: ", myrank1, myrank2
    ! print "(i4,a,2i3)", myrank,": g1 and g2: ", g1, g2

    ! Do the actual transfers.
    ! This version with non-blocking communications
    ncomms = size(comms)
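    ! Descriptive note: for each entry of the plan, the destination
    ! rank posts a non-blocking receive and the source rank posts the
    ! matching non-blocking send. The entry index is used as message
    ! tag, and the group ranks stored in the plan are translated to
    ! ranks in bridge_comm before posting.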
    ! Some bookkeeping for the requests
    nrecvs_local = 0
    nsends_local = 0
    do i=1,ncomms
       c => comms(i)
       if (myrank2 == c%dst) then
          nrecvs_local = nrecvs_local + 1
       endif
       if (myrank1 == c%src) then
          nsends_local = nsends_local + 1
       endif
    enddo
    allocate(local_reqR(nrecvs_local))
    allocate(local_reqS(nsends_local))
    allocate(statuses(mpi_status_size,nrecvs_local))

    ! First, post the receives
    nrecvs_local = 0
    do i=1,ncomms
       c => comms(i)
       if (myrank2 == c%dst) then
          nrecvs_local = nrecvs_local + 1
          src_in_comm = comm_rank1(c%src)
          call MPI_irecv(data2(c%i2),c%nitems,MPI_integer,src_in_comm, &
               i,bridge_comm,local_reqR(nrecvs_local),ierr)
       endif
    enddo

    ! Post the sends
    nsends_local = 0
    do i=1,ncomms
       c => comms(i)
       if (myrank1 == c%src) then
          nsends_local = nsends_local + 1
          dst_in_comm = comm_rank2(c%dst)
          call MPI_isend(data1(c%i1),c%nitems,MPI_integer,dst_in_comm, &
               i,bridge_comm,local_reqS(nsends_local),ierr)
       endif
    enddo

    ! A former loop of waits can be substituted by a "waitall",
    ! with every processor keeping track of the actual number of
    ! requests in which it is involved.
    call MPI_waitall(nrecvs_local, local_reqR, statuses, ierr)
    ! Complete the send requests as well, so that they can be
    ! safely deallocated below
    call MPI_waitall(nsends_local, local_reqS, MPI_STATUSES_IGNORE, ierr)

    ! This barrier is needed, I think
    call MPI_Barrier(bridge_comm,ierr)

    deallocate(local_reqR, local_reqS, statuses)

  end subroutine do_transfers_int

  !--------------------------------------------------
  subroutine do_transfers_dp(comms,data1,data2,g1,g2,bridge_comm)
    use mpi

    integer, parameter :: dp = selected_real_kind(10,100)

    type(comm_t), intent(in), target :: comms(:)
    real(dp), dimension(:), pointer  :: data1
    real(dp), dimension(:), pointer  :: data2
    integer, intent(in)              :: g1
    integer, intent(in)              :: g2
    integer, intent(in)              :: bridge_comm

    integer :: basegroup, nsize1, nsize2, ierr
    integer, allocatable :: comm_rank1(:), comm_rank2(:)
    integer :: ncomms
    integer :: i
    integer :: nrecvs_local, nsends_local
    integer, allocatable :: statuses(:,:), local_reqR(:), local_reqS(:)
    integer :: src_in_comm, dst_in_comm
    integer :: myrank1, myrank2, myid
    type(comm_t), pointer :: c

    call MPI_Comm_Rank( bridge_comm, myid, ierr )
    ! print *, "Entering transfer_dp"
    ! print *, "rank, Associated data1: ", myid, associated(data1)
    ! print *, "rank, Associated data2: ", myid, associated(data2)

    ! Find the rank correspondences, in case
    ! there is implicit renumbering at the time of group creation
    call MPI_Comm_group( bridge_comm, basegroup, ierr )
    call MPI_Group_Size( g1, nsize1, ierr )
    call MPI_Group_Size( g2, nsize2, ierr )
    allocate(comm_rank1(0:nsize1-1))
    call MPI_Group_translate_ranks( g1, nsize1, (/ (i,i=0,nsize1-1) /), &
         basegroup, comm_rank1, ierr )
    ! print "(a,10i3)", "Ranks of g1 in base group:", comm_rank1
    allocate(comm_rank2(0:nsize2-1))
    call MPI_Group_translate_ranks( g2, nsize2, (/ (i,i=0,nsize2-1) /), &
         basegroup, comm_rank2, ierr )
    ! print "(a,10i3)", "Ranks of g2 in base group:", comm_rank2

    call mpi_group_rank(g1,myrank1,ierr)
    call mpi_group_rank(g2,myrank2,ierr)

    ! Do the actual transfers.
    ! This version with non-blocking communications
    ncomms = size(comms)

    ! Some bookkeeping for the requests
    nrecvs_local = 0
    nsends_local = 0
    do i=1,ncomms
       c => comms(i)
       if (myrank2 == c%dst) then
          nrecvs_local = nrecvs_local + 1
       endif
       if (myrank1 == c%src) then
          nsends_local = nsends_local + 1
       endif
    enddo
    allocate(local_reqR(nrecvs_local))
    allocate(local_reqS(nsends_local))
    allocate(statuses(mpi_status_size,nrecvs_local))
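    ! Note: the posting scheme below mirrors do_transfers_int, with the
    ! payload taken from the real(dp) arrays and sent with the
    ! MPI_Double_Precision datatype.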
    ! First, post the receives
    nrecvs_local = 0
    do i=1,ncomms
       c => comms(i)
       if (myrank2 == c%dst) then
          nrecvs_local = nrecvs_local + 1
          src_in_comm = comm_rank1(c%src)
          call MPI_irecv(data2(c%i2),c%nitems,MPI_Double_Precision,src_in_comm, &
               i,bridge_comm,local_reqR(nrecvs_local),ierr)
       endif
    enddo

    ! Post the sends
    nsends_local = 0
    do i=1,ncomms
       c => comms(i)
       if (myrank1 == c%src) then
          nsends_local = nsends_local + 1
          dst_in_comm = comm_rank2(c%dst)
          call MPI_isend(data1(c%i1),c%nitems,MPI_Double_Precision,dst_in_comm, &
               i,bridge_comm,local_reqS(nsends_local),ierr)
       endif
    enddo

    ! A former loop of waits can be substituted by a "waitall",
    ! with every processor keeping track of the actual number of
    ! requests in which it is involved.
    call MPI_waitall(nrecvs_local, local_reqR, statuses, ierr)
    ! Complete the send requests as well, so that they can be
    ! safely deallocated below
    call MPI_waitall(nsends_local, local_reqS, MPI_STATUSES_IGNORE, ierr)

    ! This barrier is needed, I think
    call MPI_Barrier(bridge_comm,ierr)

    deallocate(local_reqR, local_reqS, statuses)

  end subroutine do_transfers_dp

#endif
end module m_redist_spmatrix
! --- End of tangled code
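! Usage sketch (illustrative note, not part of the tangled sources).
! The setup of the Distribution objects and the variable names below
! are hypothetical stand-ins for whatever class_Distribution provides;
! only the call signature is taken from the module above.
!
!   type(aux_matrix)   :: m1, m2
!   type(Distribution) :: dist1, dist2
!   ! ... set up dist1/dist2 and fill m1%numcols, m1%cols, m1%vals
!   ! on the processes owning the source distribution ...
!   call redistribute_spmatrix(norbs, m1, dist1, m2, dist2, bridge_comm)
!   ! On the processes of the second set, m2 now holds the
!   ! redistributed sparsity pattern and values.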